These changes are the raw update to linux-4.4.6-rt14. Kernel sources
[kvmfornfv.git] / kernel / drivers / staging / rdma / hfi1 / chip.c
diff --git a/kernel/drivers/staging/rdma/hfi1/chip.c b/kernel/drivers/staging/rdma/hfi1/chip.c
new file mode 100644 (file)
index 0000000..e489819
--- /dev/null
@@ -0,0 +1,10886 @@
+/*
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *  - Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  - Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *  - Neither the name of Intel Corporation nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+/*
+ * This file contains all of the code that is specific to the HFI chip
+ */
+
+#include <linux/pci.h>
+#include <linux/delay.h>
+#include <linux/interrupt.h>
+#include <linux/module.h>
+
+#include "hfi.h"
+#include "trace.h"
+#include "mad.h"
+#include "pio.h"
+#include "sdma.h"
+#include "eprom.h"
+
+#define NUM_IB_PORTS 1
+
+uint kdeth_qp;
+module_param_named(kdeth_qp, kdeth_qp, uint, S_IRUGO);
+MODULE_PARM_DESC(kdeth_qp, "Set the KDETH queue pair prefix");
+
+uint num_vls = HFI1_MAX_VLS_SUPPORTED;
+module_param(num_vls, uint, S_IRUGO);
+MODULE_PARM_DESC(num_vls, "Set number of Virtual Lanes to use (1-8)");
+
+/*
+ * Default time to aggregate two 10K packets from the idle state
+ * (timer not running). The timer starts at the end of the first packet,
+ * so only the time for one 10K packet and header plus a bit extra is needed.
+ * 10 * 1024 + 64 header byte = 10304 byte
+ * 10304 byte / 12.5 GB/s = 824.32ns
+ */
+uint rcv_intr_timeout = (824 + 16); /* 16 is for coalescing interrupt */
+module_param(rcv_intr_timeout, uint, S_IRUGO);
+MODULE_PARM_DESC(rcv_intr_timeout, "Receive interrupt mitigation timeout in ns");
+
+uint rcv_intr_count = 16; /* same as qib */
+module_param(rcv_intr_count, uint, S_IRUGO);
+MODULE_PARM_DESC(rcv_intr_count, "Receive interrupt mitigation count");
+
+ushort link_crc_mask = SUPPORTED_CRCS;
+module_param(link_crc_mask, ushort, S_IRUGO);
+MODULE_PARM_DESC(link_crc_mask, "CRCs to use on the link");
+
+uint loopback;
+module_param_named(loopback, loopback, uint, S_IRUGO);
+MODULE_PARM_DESC(loopback, "Put into loopback mode (1 = serdes, 3 = external cable");
+
+/* Other driver tunables */
+uint rcv_intr_dynamic = 1; /* enable dynamic mode for rcv int mitigation*/
+static ushort crc_14b_sideband = 1;
+static uint use_flr = 1;
+uint quick_linkup; /* skip LNI */
+
+struct flag_table {
+       u64 flag;       /* the flag */
+       char *str;      /* description string */
+       u16 extra;      /* extra information */
+       u16 unused0;
+       u32 unused1;
+};
+
+/* str must be a string constant */
+#define FLAG_ENTRY(str, extra, flag) {flag, str, extra}
+#define FLAG_ENTRY0(str, flag) {flag, str, 0}
+
+/* Send Error Consequences */
+#define SEC_WRITE_DROPPED      0x1
+#define SEC_PACKET_DROPPED     0x2
+#define SEC_SC_HALTED          0x4     /* per-context only */
+#define SEC_SPC_FREEZE         0x8     /* per-HFI only */
+
+#define VL15CTXT                  1
+#define MIN_KERNEL_KCTXTS         2
+#define NUM_MAP_REGS             32
+
+/* Bit offset into the GUID which carries HFI id information */
+#define GUID_HFI_INDEX_SHIFT     39
+
+/* extract the emulation revision */
+#define emulator_rev(dd) ((dd)->irev >> 8)
+/* parallel and serial emulation versions are 3 and 4 respectively */
+#define is_emulator_p(dd) ((((dd)->irev) & 0xf) == 3)
+#define is_emulator_s(dd) ((((dd)->irev) & 0xf) == 4)
+
+/* RSM fields */
+
+/* packet type */
+#define IB_PACKET_TYPE         2ull
+#define QW_SHIFT               6ull
+/* QPN[7..1] */
+#define QPN_WIDTH              7ull
+
+/* LRH.BTH: QW 0, OFFSET 48 - for match */
+#define LRH_BTH_QW             0ull
+#define LRH_BTH_BIT_OFFSET     48ull
+#define LRH_BTH_OFFSET(off)    ((LRH_BTH_QW << QW_SHIFT) | (off))
+#define LRH_BTH_MATCH_OFFSET   LRH_BTH_OFFSET(LRH_BTH_BIT_OFFSET)
+#define LRH_BTH_SELECT
+#define LRH_BTH_MASK           3ull
+#define LRH_BTH_VALUE          2ull
+
+/* LRH.SC[3..0] QW 0, OFFSET 56 - for match */
+#define LRH_SC_QW              0ull
+#define LRH_SC_BIT_OFFSET      56ull
+#define LRH_SC_OFFSET(off)     ((LRH_SC_QW << QW_SHIFT) | (off))
+#define LRH_SC_MATCH_OFFSET    LRH_SC_OFFSET(LRH_SC_BIT_OFFSET)
+#define LRH_SC_MASK            128ull
+#define LRH_SC_VALUE           0ull
+
+/* SC[n..0] QW 0, OFFSET 60 - for select */
+#define LRH_SC_SELECT_OFFSET  ((LRH_SC_QW << QW_SHIFT) | (60ull))
+
+/* QPN[m+n:1] QW 1, OFFSET 1 */
+#define QPN_SELECT_OFFSET      ((1ull << QW_SHIFT) | (1ull))
+
+/* defines to build power on SC2VL table */
+#define SC2VL_VAL( \
+       num, \
+       sc0, sc0val, \
+       sc1, sc1val, \
+       sc2, sc2val, \
+       sc3, sc3val, \
+       sc4, sc4val, \
+       sc5, sc5val, \
+       sc6, sc6val, \
+       sc7, sc7val) \
+( \
+       ((u64)(sc0val) << SEND_SC2VLT##num##_SC##sc0##_SHIFT) | \
+       ((u64)(sc1val) << SEND_SC2VLT##num##_SC##sc1##_SHIFT) | \
+       ((u64)(sc2val) << SEND_SC2VLT##num##_SC##sc2##_SHIFT) | \
+       ((u64)(sc3val) << SEND_SC2VLT##num##_SC##sc3##_SHIFT) | \
+       ((u64)(sc4val) << SEND_SC2VLT##num##_SC##sc4##_SHIFT) | \
+       ((u64)(sc5val) << SEND_SC2VLT##num##_SC##sc5##_SHIFT) | \
+       ((u64)(sc6val) << SEND_SC2VLT##num##_SC##sc6##_SHIFT) | \
+       ((u64)(sc7val) << SEND_SC2VLT##num##_SC##sc7##_SHIFT)   \
+)
+
+#define DC_SC_VL_VAL( \
+       range, \
+       e0, e0val, \
+       e1, e1val, \
+       e2, e2val, \
+       e3, e3val, \
+       e4, e4val, \
+       e5, e5val, \
+       e6, e6val, \
+       e7, e7val, \
+       e8, e8val, \
+       e9, e9val, \
+       e10, e10val, \
+       e11, e11val, \
+       e12, e12val, \
+       e13, e13val, \
+       e14, e14val, \
+       e15, e15val) \
+( \
+       ((u64)(e0val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e0##_SHIFT) | \
+       ((u64)(e1val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e1##_SHIFT) | \
+       ((u64)(e2val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e2##_SHIFT) | \
+       ((u64)(e3val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e3##_SHIFT) | \
+       ((u64)(e4val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e4##_SHIFT) | \
+       ((u64)(e5val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e5##_SHIFT) | \
+       ((u64)(e6val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e6##_SHIFT) | \
+       ((u64)(e7val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e7##_SHIFT) | \
+       ((u64)(e8val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e8##_SHIFT) | \
+       ((u64)(e9val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e9##_SHIFT) | \
+       ((u64)(e10val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e10##_SHIFT) | \
+       ((u64)(e11val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e11##_SHIFT) | \
+       ((u64)(e12val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e12##_SHIFT) | \
+       ((u64)(e13val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e13##_SHIFT) | \
+       ((u64)(e14val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e14##_SHIFT) | \
+       ((u64)(e15val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e15##_SHIFT) \
+)
+
+/* all CceStatus sub-block freeze bits */
+#define ALL_FROZE (CCE_STATUS_SDMA_FROZE_SMASK \
+                       | CCE_STATUS_RXE_FROZE_SMASK \
+                       | CCE_STATUS_TXE_FROZE_SMASK \
+                       | CCE_STATUS_TXE_PIO_FROZE_SMASK)
+/* all CceStatus sub-block TXE pause bits */
+#define ALL_TXE_PAUSE (CCE_STATUS_TXE_PIO_PAUSED_SMASK \
+                       | CCE_STATUS_TXE_PAUSED_SMASK \
+                       | CCE_STATUS_SDMA_PAUSED_SMASK)
+/* all CceStatus sub-block RXE pause bits */
+#define ALL_RXE_PAUSE CCE_STATUS_RXE_PAUSED_SMASK
+
+/*
+ * CCE Error flags.
+ */
+static struct flag_table cce_err_status_flags[] = {
+/* 0*/ FLAG_ENTRY0("CceCsrParityErr",
+               CCE_ERR_STATUS_CCE_CSR_PARITY_ERR_SMASK),
+/* 1*/ FLAG_ENTRY0("CceCsrReadBadAddrErr",
+               CCE_ERR_STATUS_CCE_CSR_READ_BAD_ADDR_ERR_SMASK),
+/* 2*/ FLAG_ENTRY0("CceCsrWriteBadAddrErr",
+               CCE_ERR_STATUS_CCE_CSR_WRITE_BAD_ADDR_ERR_SMASK),
+/* 3*/ FLAG_ENTRY0("CceTrgtAsyncFifoParityErr",
+               CCE_ERR_STATUS_CCE_TRGT_ASYNC_FIFO_PARITY_ERR_SMASK),
+/* 4*/ FLAG_ENTRY0("CceTrgtAccessErr",
+               CCE_ERR_STATUS_CCE_TRGT_ACCESS_ERR_SMASK),
+/* 5*/ FLAG_ENTRY0("CceRspdDataParityErr",
+               CCE_ERR_STATUS_CCE_RSPD_DATA_PARITY_ERR_SMASK),
+/* 6*/ FLAG_ENTRY0("CceCli0AsyncFifoParityErr",
+               CCE_ERR_STATUS_CCE_CLI0_ASYNC_FIFO_PARITY_ERR_SMASK),
+/* 7*/ FLAG_ENTRY0("CceCsrCfgBusParityErr",
+               CCE_ERR_STATUS_CCE_CSR_CFG_BUS_PARITY_ERR_SMASK),
+/* 8*/ FLAG_ENTRY0("CceCli2AsyncFifoParityErr",
+               CCE_ERR_STATUS_CCE_CLI2_ASYNC_FIFO_PARITY_ERR_SMASK),
+/* 9*/ FLAG_ENTRY0("CceCli1AsyncFifoPioCrdtParityErr",
+           CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_PIO_CRDT_PARITY_ERR_SMASK),
+/*10*/ FLAG_ENTRY0("CceCli1AsyncFifoPioCrdtParityErr",
+           CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_SDMA_HD_PARITY_ERR_SMASK),
+/*11*/ FLAG_ENTRY0("CceCli1AsyncFifoRxdmaParityError",
+           CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_RXDMA_PARITY_ERROR_SMASK),
+/*12*/ FLAG_ENTRY0("CceCli1AsyncFifoDbgParityError",
+               CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_DBG_PARITY_ERROR_SMASK),
+/*13*/ FLAG_ENTRY0("PcicRetryMemCorErr",
+               CCE_ERR_STATUS_PCIC_RETRY_MEM_COR_ERR_SMASK),
+/*14*/ FLAG_ENTRY0("PcicRetryMemCorErr",
+               CCE_ERR_STATUS_PCIC_RETRY_SOT_MEM_COR_ERR_SMASK),
+/*15*/ FLAG_ENTRY0("PcicPostHdQCorErr",
+               CCE_ERR_STATUS_PCIC_POST_HD_QCOR_ERR_SMASK),
+/*16*/ FLAG_ENTRY0("PcicPostHdQCorErr",
+               CCE_ERR_STATUS_PCIC_POST_DAT_QCOR_ERR_SMASK),
+/*17*/ FLAG_ENTRY0("PcicPostHdQCorErr",
+               CCE_ERR_STATUS_PCIC_CPL_HD_QCOR_ERR_SMASK),
+/*18*/ FLAG_ENTRY0("PcicCplDatQCorErr",
+               CCE_ERR_STATUS_PCIC_CPL_DAT_QCOR_ERR_SMASK),
+/*19*/ FLAG_ENTRY0("PcicNPostHQParityErr",
+               CCE_ERR_STATUS_PCIC_NPOST_HQ_PARITY_ERR_SMASK),
+/*20*/ FLAG_ENTRY0("PcicNPostDatQParityErr",
+               CCE_ERR_STATUS_PCIC_NPOST_DAT_QPARITY_ERR_SMASK),
+/*21*/ FLAG_ENTRY0("PcicRetryMemUncErr",
+               CCE_ERR_STATUS_PCIC_RETRY_MEM_UNC_ERR_SMASK),
+/*22*/ FLAG_ENTRY0("PcicRetrySotMemUncErr",
+               CCE_ERR_STATUS_PCIC_RETRY_SOT_MEM_UNC_ERR_SMASK),
+/*23*/ FLAG_ENTRY0("PcicPostHdQUncErr",
+               CCE_ERR_STATUS_PCIC_POST_HD_QUNC_ERR_SMASK),
+/*24*/ FLAG_ENTRY0("PcicPostDatQUncErr",
+               CCE_ERR_STATUS_PCIC_POST_DAT_QUNC_ERR_SMASK),
+/*25*/ FLAG_ENTRY0("PcicCplHdQUncErr",
+               CCE_ERR_STATUS_PCIC_CPL_HD_QUNC_ERR_SMASK),
+/*26*/ FLAG_ENTRY0("PcicCplDatQUncErr",
+               CCE_ERR_STATUS_PCIC_CPL_DAT_QUNC_ERR_SMASK),
+/*27*/ FLAG_ENTRY0("PcicTransmitFrontParityErr",
+               CCE_ERR_STATUS_PCIC_TRANSMIT_FRONT_PARITY_ERR_SMASK),
+/*28*/ FLAG_ENTRY0("PcicTransmitBackParityErr",
+               CCE_ERR_STATUS_PCIC_TRANSMIT_BACK_PARITY_ERR_SMASK),
+/*29*/ FLAG_ENTRY0("PcicReceiveParityErr",
+               CCE_ERR_STATUS_PCIC_RECEIVE_PARITY_ERR_SMASK),
+/*30*/ FLAG_ENTRY0("CceTrgtCplTimeoutErr",
+               CCE_ERR_STATUS_CCE_TRGT_CPL_TIMEOUT_ERR_SMASK),
+/*31*/ FLAG_ENTRY0("LATriggered",
+               CCE_ERR_STATUS_LA_TRIGGERED_SMASK),
+/*32*/ FLAG_ENTRY0("CceSegReadBadAddrErr",
+               CCE_ERR_STATUS_CCE_SEG_READ_BAD_ADDR_ERR_SMASK),
+/*33*/ FLAG_ENTRY0("CceSegWriteBadAddrErr",
+               CCE_ERR_STATUS_CCE_SEG_WRITE_BAD_ADDR_ERR_SMASK),
+/*34*/ FLAG_ENTRY0("CceRcplAsyncFifoParityErr",
+               CCE_ERR_STATUS_CCE_RCPL_ASYNC_FIFO_PARITY_ERR_SMASK),
+/*35*/ FLAG_ENTRY0("CceRxdmaConvFifoParityErr",
+               CCE_ERR_STATUS_CCE_RXDMA_CONV_FIFO_PARITY_ERR_SMASK),
+/*36*/ FLAG_ENTRY0("CceMsixTableCorErr",
+               CCE_ERR_STATUS_CCE_MSIX_TABLE_COR_ERR_SMASK),
+/*37*/ FLAG_ENTRY0("CceMsixTableUncErr",
+               CCE_ERR_STATUS_CCE_MSIX_TABLE_UNC_ERR_SMASK),
+/*38*/ FLAG_ENTRY0("CceIntMapCorErr",
+               CCE_ERR_STATUS_CCE_INT_MAP_COR_ERR_SMASK),
+/*39*/ FLAG_ENTRY0("CceIntMapUncErr",
+               CCE_ERR_STATUS_CCE_INT_MAP_UNC_ERR_SMASK),
+/*40*/ FLAG_ENTRY0("CceMsixCsrParityErr",
+               CCE_ERR_STATUS_CCE_MSIX_CSR_PARITY_ERR_SMASK),
+/*41-63 reserved*/
+};
+
+/*
+ * Misc Error flags
+ */
+#define MES(text) MISC_ERR_STATUS_MISC_##text##_ERR_SMASK
+static struct flag_table misc_err_status_flags[] = {
+/* 0*/ FLAG_ENTRY0("CSR_PARITY", MES(CSR_PARITY)),
+/* 1*/ FLAG_ENTRY0("CSR_READ_BAD_ADDR", MES(CSR_READ_BAD_ADDR)),
+/* 2*/ FLAG_ENTRY0("CSR_WRITE_BAD_ADDR", MES(CSR_WRITE_BAD_ADDR)),
+/* 3*/ FLAG_ENTRY0("SBUS_WRITE_FAILED", MES(SBUS_WRITE_FAILED)),
+/* 4*/ FLAG_ENTRY0("KEY_MISMATCH", MES(KEY_MISMATCH)),
+/* 5*/ FLAG_ENTRY0("FW_AUTH_FAILED", MES(FW_AUTH_FAILED)),
+/* 6*/ FLAG_ENTRY0("EFUSE_CSR_PARITY", MES(EFUSE_CSR_PARITY)),
+/* 7*/ FLAG_ENTRY0("EFUSE_READ_BAD_ADDR", MES(EFUSE_READ_BAD_ADDR)),
+/* 8*/ FLAG_ENTRY0("EFUSE_WRITE", MES(EFUSE_WRITE)),
+/* 9*/ FLAG_ENTRY0("EFUSE_DONE_PARITY", MES(EFUSE_DONE_PARITY)),
+/*10*/ FLAG_ENTRY0("INVALID_EEP_CMD", MES(INVALID_EEP_CMD)),
+/*11*/ FLAG_ENTRY0("MBIST_FAIL", MES(MBIST_FAIL)),
+/*12*/ FLAG_ENTRY0("PLL_LOCK_FAIL", MES(PLL_LOCK_FAIL))
+};
+
+/*
+ * TXE PIO Error flags and consequences
+ */
+static struct flag_table pio_err_status_flags[] = {
+/* 0*/ FLAG_ENTRY("PioWriteBadCtxt",
+       SEC_WRITE_DROPPED,
+       SEND_PIO_ERR_STATUS_PIO_WRITE_BAD_CTXT_ERR_SMASK),
+/* 1*/ FLAG_ENTRY("PioWriteAddrParity",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_WRITE_ADDR_PARITY_ERR_SMASK),
+/* 2*/ FLAG_ENTRY("PioCsrParity",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_CSR_PARITY_ERR_SMASK),
+/* 3*/ FLAG_ENTRY("PioSbMemFifo0",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_SB_MEM_FIFO0_ERR_SMASK),
+/* 4*/ FLAG_ENTRY("PioSbMemFifo1",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_SB_MEM_FIFO1_ERR_SMASK),
+/* 5*/ FLAG_ENTRY("PioPccFifoParity",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_PCC_FIFO_PARITY_ERR_SMASK),
+/* 6*/ FLAG_ENTRY("PioPecFifoParity",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_PEC_FIFO_PARITY_ERR_SMASK),
+/* 7*/ FLAG_ENTRY("PioSbrdctlCrrelParity",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_SBRDCTL_CRREL_PARITY_ERR_SMASK),
+/* 8*/ FLAG_ENTRY("PioSbrdctrlCrrelFifoParity",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_SBRDCTRL_CRREL_FIFO_PARITY_ERR_SMASK),
+/* 9*/ FLAG_ENTRY("PioPktEvictFifoParityErr",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_PKT_EVICT_FIFO_PARITY_ERR_SMASK),
+/*10*/ FLAG_ENTRY("PioSmPktResetParity",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_SM_PKT_RESET_PARITY_ERR_SMASK),
+/*11*/ FLAG_ENTRY("PioVlLenMemBank0Unc",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK0_UNC_ERR_SMASK),
+/*12*/ FLAG_ENTRY("PioVlLenMemBank1Unc",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK1_UNC_ERR_SMASK),
+/*13*/ FLAG_ENTRY("PioVlLenMemBank0Cor",
+       0,
+       SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK0_COR_ERR_SMASK),
+/*14*/ FLAG_ENTRY("PioVlLenMemBank1Cor",
+       0,
+       SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK1_COR_ERR_SMASK),
+/*15*/ FLAG_ENTRY("PioCreditRetFifoParity",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_CREDIT_RET_FIFO_PARITY_ERR_SMASK),
+/*16*/ FLAG_ENTRY("PioPpmcPblFifo",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_PPMC_PBL_FIFO_ERR_SMASK),
+/*17*/ FLAG_ENTRY("PioInitSmIn",
+       0,
+       SEND_PIO_ERR_STATUS_PIO_INIT_SM_IN_ERR_SMASK),
+/*18*/ FLAG_ENTRY("PioPktEvictSmOrArbSm",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_PKT_EVICT_SM_OR_ARB_SM_ERR_SMASK),
+/*19*/ FLAG_ENTRY("PioHostAddrMemUnc",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_HOST_ADDR_MEM_UNC_ERR_SMASK),
+/*20*/ FLAG_ENTRY("PioHostAddrMemCor",
+       0,
+       SEND_PIO_ERR_STATUS_PIO_HOST_ADDR_MEM_COR_ERR_SMASK),
+/*21*/ FLAG_ENTRY("PioWriteDataParity",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_WRITE_DATA_PARITY_ERR_SMASK),
+/*22*/ FLAG_ENTRY("PioStateMachine",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_STATE_MACHINE_ERR_SMASK),
+/*23*/ FLAG_ENTRY("PioWriteQwValidParity",
+       SEC_WRITE_DROPPED|SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_WRITE_QW_VALID_PARITY_ERR_SMASK),
+/*24*/ FLAG_ENTRY("PioBlockQwCountParity",
+       SEC_WRITE_DROPPED|SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_BLOCK_QW_COUNT_PARITY_ERR_SMASK),
+/*25*/ FLAG_ENTRY("PioVlfVlLenParity",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_VLF_VL_LEN_PARITY_ERR_SMASK),
+/*26*/ FLAG_ENTRY("PioVlfSopParity",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_VLF_SOP_PARITY_ERR_SMASK),
+/*27*/ FLAG_ENTRY("PioVlFifoParity",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_VL_FIFO_PARITY_ERR_SMASK),
+/*28*/ FLAG_ENTRY("PioPpmcBqcMemParity",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_PPMC_BQC_MEM_PARITY_ERR_SMASK),
+/*29*/ FLAG_ENTRY("PioPpmcSopLen",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_PPMC_SOP_LEN_ERR_SMASK),
+/*30-31 reserved*/
+/*32*/ FLAG_ENTRY("PioCurrentFreeCntParity",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_CURRENT_FREE_CNT_PARITY_ERR_SMASK),
+/*33*/ FLAG_ENTRY("PioLastReturnedCntParity",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_LAST_RETURNED_CNT_PARITY_ERR_SMASK),
+/*34*/ FLAG_ENTRY("PioPccSopHeadParity",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_PCC_SOP_HEAD_PARITY_ERR_SMASK),
+/*35*/ FLAG_ENTRY("PioPecSopHeadParityErr",
+       SEC_SPC_FREEZE,
+       SEND_PIO_ERR_STATUS_PIO_PEC_SOP_HEAD_PARITY_ERR_SMASK),
+/*36-63 reserved*/
+};
+
+/* TXE PIO errors that cause an SPC freeze */
+#define ALL_PIO_FREEZE_ERR \
+       (SEND_PIO_ERR_STATUS_PIO_WRITE_ADDR_PARITY_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_CSR_PARITY_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_SB_MEM_FIFO0_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_SB_MEM_FIFO1_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_PCC_FIFO_PARITY_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_PEC_FIFO_PARITY_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_SBRDCTL_CRREL_PARITY_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_SBRDCTRL_CRREL_FIFO_PARITY_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_PKT_EVICT_FIFO_PARITY_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_SM_PKT_RESET_PARITY_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK0_UNC_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK1_UNC_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_CREDIT_RET_FIFO_PARITY_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_PPMC_PBL_FIFO_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_PKT_EVICT_SM_OR_ARB_SM_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_HOST_ADDR_MEM_UNC_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_WRITE_DATA_PARITY_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_STATE_MACHINE_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_WRITE_QW_VALID_PARITY_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_BLOCK_QW_COUNT_PARITY_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_VLF_VL_LEN_PARITY_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_VLF_SOP_PARITY_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_VL_FIFO_PARITY_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_PPMC_BQC_MEM_PARITY_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_PPMC_SOP_LEN_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_CURRENT_FREE_CNT_PARITY_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_LAST_RETURNED_CNT_PARITY_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_PCC_SOP_HEAD_PARITY_ERR_SMASK \
+       | SEND_PIO_ERR_STATUS_PIO_PEC_SOP_HEAD_PARITY_ERR_SMASK)
+
+/*
+ * TXE SDMA Error flags
+ */
+static struct flag_table sdma_err_status_flags[] = {
+/* 0*/ FLAG_ENTRY0("SDmaRpyTagErr",
+               SEND_DMA_ERR_STATUS_SDMA_RPY_TAG_ERR_SMASK),
+/* 1*/ FLAG_ENTRY0("SDmaCsrParityErr",
+               SEND_DMA_ERR_STATUS_SDMA_CSR_PARITY_ERR_SMASK),
+/* 2*/ FLAG_ENTRY0("SDmaPcieReqTrackingUncErr",
+               SEND_DMA_ERR_STATUS_SDMA_PCIE_REQ_TRACKING_UNC_ERR_SMASK),
+/* 3*/ FLAG_ENTRY0("SDmaPcieReqTrackingCorErr",
+               SEND_DMA_ERR_STATUS_SDMA_PCIE_REQ_TRACKING_COR_ERR_SMASK),
+/*04-63 reserved*/
+};
+
+/* TXE SDMA errors that cause an SPC freeze */
+#define ALL_SDMA_FREEZE_ERR  \
+               (SEND_DMA_ERR_STATUS_SDMA_RPY_TAG_ERR_SMASK \
+               | SEND_DMA_ERR_STATUS_SDMA_CSR_PARITY_ERR_SMASK \
+               | SEND_DMA_ERR_STATUS_SDMA_PCIE_REQ_TRACKING_UNC_ERR_SMASK)
+
+/*
+ * TXE Egress Error flags
+ */
+#define SEES(text) SEND_EGRESS_ERR_STATUS_##text##_ERR_SMASK
+static struct flag_table egress_err_status_flags[] = {
+/* 0*/ FLAG_ENTRY0("TxPktIntegrityMemCorErr", SEES(TX_PKT_INTEGRITY_MEM_COR)),
+/* 1*/ FLAG_ENTRY0("TxPktIntegrityMemUncErr", SEES(TX_PKT_INTEGRITY_MEM_UNC)),
+/* 2 reserved */
+/* 3*/ FLAG_ENTRY0("TxEgressFifoUnderrunOrParityErr",
+               SEES(TX_EGRESS_FIFO_UNDERRUN_OR_PARITY)),
+/* 4*/ FLAG_ENTRY0("TxLinkdownErr", SEES(TX_LINKDOWN)),
+/* 5*/ FLAG_ENTRY0("TxIncorrectLinkStateErr", SEES(TX_INCORRECT_LINK_STATE)),
+/* 6 reserved */
+/* 7*/ FLAG_ENTRY0("TxPioLaunchIntfParityErr",
+               SEES(TX_PIO_LAUNCH_INTF_PARITY)),
+/* 8*/ FLAG_ENTRY0("TxSdmaLaunchIntfParityErr",
+               SEES(TX_SDMA_LAUNCH_INTF_PARITY)),
+/* 9-10 reserved */
+/*11*/ FLAG_ENTRY0("TxSbrdCtlStateMachineParityErr",
+               SEES(TX_SBRD_CTL_STATE_MACHINE_PARITY)),
+/*12*/ FLAG_ENTRY0("TxIllegalVLErr", SEES(TX_ILLEGAL_VL)),
+/*13*/ FLAG_ENTRY0("TxLaunchCsrParityErr", SEES(TX_LAUNCH_CSR_PARITY)),
+/*14*/ FLAG_ENTRY0("TxSbrdCtlCsrParityErr", SEES(TX_SBRD_CTL_CSR_PARITY)),
+/*15*/ FLAG_ENTRY0("TxConfigParityErr", SEES(TX_CONFIG_PARITY)),
+/*16*/ FLAG_ENTRY0("TxSdma0DisallowedPacketErr",
+               SEES(TX_SDMA0_DISALLOWED_PACKET)),
+/*17*/ FLAG_ENTRY0("TxSdma1DisallowedPacketErr",
+               SEES(TX_SDMA1_DISALLOWED_PACKET)),
+/*18*/ FLAG_ENTRY0("TxSdma2DisallowedPacketErr",
+               SEES(TX_SDMA2_DISALLOWED_PACKET)),
+/*19*/ FLAG_ENTRY0("TxSdma3DisallowedPacketErr",
+               SEES(TX_SDMA3_DISALLOWED_PACKET)),
+/*20*/ FLAG_ENTRY0("TxSdma4DisallowedPacketErr",
+               SEES(TX_SDMA4_DISALLOWED_PACKET)),
+/*21*/ FLAG_ENTRY0("TxSdma5DisallowedPacketErr",
+               SEES(TX_SDMA5_DISALLOWED_PACKET)),
+/*22*/ FLAG_ENTRY0("TxSdma6DisallowedPacketErr",
+               SEES(TX_SDMA6_DISALLOWED_PACKET)),
+/*23*/ FLAG_ENTRY0("TxSdma7DisallowedPacketErr",
+               SEES(TX_SDMA7_DISALLOWED_PACKET)),
+/*24*/ FLAG_ENTRY0("TxSdma8DisallowedPacketErr",
+               SEES(TX_SDMA8_DISALLOWED_PACKET)),
+/*25*/ FLAG_ENTRY0("TxSdma9DisallowedPacketErr",
+               SEES(TX_SDMA9_DISALLOWED_PACKET)),
+/*26*/ FLAG_ENTRY0("TxSdma10DisallowedPacketErr",
+               SEES(TX_SDMA10_DISALLOWED_PACKET)),
+/*27*/ FLAG_ENTRY0("TxSdma11DisallowedPacketErr",
+               SEES(TX_SDMA11_DISALLOWED_PACKET)),
+/*28*/ FLAG_ENTRY0("TxSdma12DisallowedPacketErr",
+               SEES(TX_SDMA12_DISALLOWED_PACKET)),
+/*29*/ FLAG_ENTRY0("TxSdma13DisallowedPacketErr",
+               SEES(TX_SDMA13_DISALLOWED_PACKET)),
+/*30*/ FLAG_ENTRY0("TxSdma14DisallowedPacketErr",
+               SEES(TX_SDMA14_DISALLOWED_PACKET)),
+/*31*/ FLAG_ENTRY0("TxSdma15DisallowedPacketErr",
+               SEES(TX_SDMA15_DISALLOWED_PACKET)),
+/*32*/ FLAG_ENTRY0("TxLaunchFifo0UncOrParityErr",
+               SEES(TX_LAUNCH_FIFO0_UNC_OR_PARITY)),
+/*33*/ FLAG_ENTRY0("TxLaunchFifo1UncOrParityErr",
+               SEES(TX_LAUNCH_FIFO1_UNC_OR_PARITY)),
+/*34*/ FLAG_ENTRY0("TxLaunchFifo2UncOrParityErr",
+               SEES(TX_LAUNCH_FIFO2_UNC_OR_PARITY)),
+/*35*/ FLAG_ENTRY0("TxLaunchFifo3UncOrParityErr",
+               SEES(TX_LAUNCH_FIFO3_UNC_OR_PARITY)),
+/*36*/ FLAG_ENTRY0("TxLaunchFifo4UncOrParityErr",
+               SEES(TX_LAUNCH_FIFO4_UNC_OR_PARITY)),
+/*37*/ FLAG_ENTRY0("TxLaunchFifo5UncOrParityErr",
+               SEES(TX_LAUNCH_FIFO5_UNC_OR_PARITY)),
+/*38*/ FLAG_ENTRY0("TxLaunchFifo6UncOrParityErr",
+               SEES(TX_LAUNCH_FIFO6_UNC_OR_PARITY)),
+/*39*/ FLAG_ENTRY0("TxLaunchFifo7UncOrParityErr",
+               SEES(TX_LAUNCH_FIFO7_UNC_OR_PARITY)),
+/*40*/ FLAG_ENTRY0("TxLaunchFifo8UncOrParityErr",
+               SEES(TX_LAUNCH_FIFO8_UNC_OR_PARITY)),
+/*41*/ FLAG_ENTRY0("TxCreditReturnParityErr", SEES(TX_CREDIT_RETURN_PARITY)),
+/*42*/ FLAG_ENTRY0("TxSbHdrUncErr", SEES(TX_SB_HDR_UNC)),
+/*43*/ FLAG_ENTRY0("TxReadSdmaMemoryUncErr", SEES(TX_READ_SDMA_MEMORY_UNC)),
+/*44*/ FLAG_ENTRY0("TxReadPioMemoryUncErr", SEES(TX_READ_PIO_MEMORY_UNC)),
+/*45*/ FLAG_ENTRY0("TxEgressFifoUncErr", SEES(TX_EGRESS_FIFO_UNC)),
+/*46*/ FLAG_ENTRY0("TxHcrcInsertionErr", SEES(TX_HCRC_INSERTION)),
+/*47*/ FLAG_ENTRY0("TxCreditReturnVLErr", SEES(TX_CREDIT_RETURN_VL)),
+/*48*/ FLAG_ENTRY0("TxLaunchFifo0CorErr", SEES(TX_LAUNCH_FIFO0_COR)),
+/*49*/ FLAG_ENTRY0("TxLaunchFifo1CorErr", SEES(TX_LAUNCH_FIFO1_COR)),
+/*50*/ FLAG_ENTRY0("TxLaunchFifo2CorErr", SEES(TX_LAUNCH_FIFO2_COR)),
+/*51*/ FLAG_ENTRY0("TxLaunchFifo3CorErr", SEES(TX_LAUNCH_FIFO3_COR)),
+/*52*/ FLAG_ENTRY0("TxLaunchFifo4CorErr", SEES(TX_LAUNCH_FIFO4_COR)),
+/*53*/ FLAG_ENTRY0("TxLaunchFifo5CorErr", SEES(TX_LAUNCH_FIFO5_COR)),
+/*54*/ FLAG_ENTRY0("TxLaunchFifo6CorErr", SEES(TX_LAUNCH_FIFO6_COR)),
+/*55*/ FLAG_ENTRY0("TxLaunchFifo7CorErr", SEES(TX_LAUNCH_FIFO7_COR)),
+/*56*/ FLAG_ENTRY0("TxLaunchFifo8CorErr", SEES(TX_LAUNCH_FIFO8_COR)),
+/*57*/ FLAG_ENTRY0("TxCreditOverrunErr", SEES(TX_CREDIT_OVERRUN)),
+/*58*/ FLAG_ENTRY0("TxSbHdrCorErr", SEES(TX_SB_HDR_COR)),
+/*59*/ FLAG_ENTRY0("TxReadSdmaMemoryCorErr", SEES(TX_READ_SDMA_MEMORY_COR)),
+/*60*/ FLAG_ENTRY0("TxReadPioMemoryCorErr", SEES(TX_READ_PIO_MEMORY_COR)),
+/*61*/ FLAG_ENTRY0("TxEgressFifoCorErr", SEES(TX_EGRESS_FIFO_COR)),
+/*62*/ FLAG_ENTRY0("TxReadSdmaMemoryCsrUncErr",
+               SEES(TX_READ_SDMA_MEMORY_CSR_UNC)),
+/*63*/ FLAG_ENTRY0("TxReadPioMemoryCsrUncErr",
+               SEES(TX_READ_PIO_MEMORY_CSR_UNC)),
+};
+
+/*
+ * TXE Egress Error Info flags
+ */
+#define SEEI(text) SEND_EGRESS_ERR_INFO_##text##_ERR_SMASK
+static struct flag_table egress_err_info_flags[] = {
+/* 0*/ FLAG_ENTRY0("Reserved", 0ull),
+/* 1*/ FLAG_ENTRY0("VLErr", SEEI(VL)),
+/* 2*/ FLAG_ENTRY0("JobKeyErr", SEEI(JOB_KEY)),
+/* 3*/ FLAG_ENTRY0("JobKeyErr", SEEI(JOB_KEY)),
+/* 4*/ FLAG_ENTRY0("PartitionKeyErr", SEEI(PARTITION_KEY)),
+/* 5*/ FLAG_ENTRY0("SLIDErr", SEEI(SLID)),
+/* 6*/ FLAG_ENTRY0("OpcodeErr", SEEI(OPCODE)),
+/* 7*/ FLAG_ENTRY0("VLMappingErr", SEEI(VL_MAPPING)),
+/* 8*/ FLAG_ENTRY0("RawErr", SEEI(RAW)),
+/* 9*/ FLAG_ENTRY0("RawIPv6Err", SEEI(RAW_IPV6)),
+/*10*/ FLAG_ENTRY0("GRHErr", SEEI(GRH)),
+/*11*/ FLAG_ENTRY0("BypassErr", SEEI(BYPASS)),
+/*12*/ FLAG_ENTRY0("KDETHPacketsErr", SEEI(KDETH_PACKETS)),
+/*13*/ FLAG_ENTRY0("NonKDETHPacketsErr", SEEI(NON_KDETH_PACKETS)),
+/*14*/ FLAG_ENTRY0("TooSmallIBPacketsErr", SEEI(TOO_SMALL_IB_PACKETS)),
+/*15*/ FLAG_ENTRY0("TooSmallBypassPacketsErr", SEEI(TOO_SMALL_BYPASS_PACKETS)),
+/*16*/ FLAG_ENTRY0("PbcTestErr", SEEI(PBC_TEST)),
+/*17*/ FLAG_ENTRY0("BadPktLenErr", SEEI(BAD_PKT_LEN)),
+/*18*/ FLAG_ENTRY0("TooLongIBPacketErr", SEEI(TOO_LONG_IB_PACKET)),
+/*19*/ FLAG_ENTRY0("TooLongBypassPacketsErr", SEEI(TOO_LONG_BYPASS_PACKETS)),
+/*20*/ FLAG_ENTRY0("PbcStaticRateControlErr", SEEI(PBC_STATIC_RATE_CONTROL)),
+/*21*/ FLAG_ENTRY0("BypassBadPktLenErr", SEEI(BAD_PKT_LEN)),
+};
+
+/* TXE Egress errors that cause an SPC freeze */
+#define ALL_TXE_EGRESS_FREEZE_ERR \
+       (SEES(TX_EGRESS_FIFO_UNDERRUN_OR_PARITY) \
+       | SEES(TX_PIO_LAUNCH_INTF_PARITY) \
+       | SEES(TX_SDMA_LAUNCH_INTF_PARITY) \
+       | SEES(TX_SBRD_CTL_STATE_MACHINE_PARITY) \
+       | SEES(TX_LAUNCH_CSR_PARITY) \
+       | SEES(TX_SBRD_CTL_CSR_PARITY) \
+       | SEES(TX_CONFIG_PARITY) \
+       | SEES(TX_LAUNCH_FIFO0_UNC_OR_PARITY) \
+       | SEES(TX_LAUNCH_FIFO1_UNC_OR_PARITY) \
+       | SEES(TX_LAUNCH_FIFO2_UNC_OR_PARITY) \
+       | SEES(TX_LAUNCH_FIFO3_UNC_OR_PARITY) \
+       | SEES(TX_LAUNCH_FIFO4_UNC_OR_PARITY) \
+       | SEES(TX_LAUNCH_FIFO5_UNC_OR_PARITY) \
+       | SEES(TX_LAUNCH_FIFO6_UNC_OR_PARITY) \
+       | SEES(TX_LAUNCH_FIFO7_UNC_OR_PARITY) \
+       | SEES(TX_LAUNCH_FIFO8_UNC_OR_PARITY) \
+       | SEES(TX_CREDIT_RETURN_PARITY))
+
+/*
+ * TXE Send error flags
+ */
+#define SES(name) SEND_ERR_STATUS_SEND_##name##_ERR_SMASK
+static struct flag_table send_err_status_flags[] = {
+/* 0*/ FLAG_ENTRY0("SDmaRpyTagErr", SES(CSR_PARITY)),
+/* 1*/ FLAG_ENTRY0("SendCsrReadBadAddrErr", SES(CSR_READ_BAD_ADDR)),
+/* 2*/ FLAG_ENTRY0("SendCsrWriteBadAddrErr", SES(CSR_WRITE_BAD_ADDR))
+};
+
+/*
+ * TXE Send Context Error flags and consequences
+ */
+static struct flag_table sc_err_status_flags[] = {
+/* 0*/ FLAG_ENTRY("InconsistentSop",
+               SEC_PACKET_DROPPED | SEC_SC_HALTED,
+               SEND_CTXT_ERR_STATUS_PIO_INCONSISTENT_SOP_ERR_SMASK),
+/* 1*/ FLAG_ENTRY("DisallowedPacket",
+               SEC_PACKET_DROPPED | SEC_SC_HALTED,
+               SEND_CTXT_ERR_STATUS_PIO_DISALLOWED_PACKET_ERR_SMASK),
+/* 2*/ FLAG_ENTRY("WriteCrossesBoundary",
+               SEC_WRITE_DROPPED | SEC_SC_HALTED,
+               SEND_CTXT_ERR_STATUS_PIO_WRITE_CROSSES_BOUNDARY_ERR_SMASK),
+/* 3*/ FLAG_ENTRY("WriteOverflow",
+               SEC_WRITE_DROPPED | SEC_SC_HALTED,
+               SEND_CTXT_ERR_STATUS_PIO_WRITE_OVERFLOW_ERR_SMASK),
+/* 4*/ FLAG_ENTRY("WriteOutOfBounds",
+               SEC_WRITE_DROPPED | SEC_SC_HALTED,
+               SEND_CTXT_ERR_STATUS_PIO_WRITE_OUT_OF_BOUNDS_ERR_SMASK),
+/* 5-63 reserved*/
+};
+
+/*
+ * RXE Receive Error flags
+ */
+#define RXES(name) RCV_ERR_STATUS_RX_##name##_ERR_SMASK
+static struct flag_table rxe_err_status_flags[] = {
+/* 0*/ FLAG_ENTRY0("RxDmaCsrCorErr", RXES(DMA_CSR_COR)),
+/* 1*/ FLAG_ENTRY0("RxDcIntfParityErr", RXES(DC_INTF_PARITY)),
+/* 2*/ FLAG_ENTRY0("RxRcvHdrUncErr", RXES(RCV_HDR_UNC)),
+/* 3*/ FLAG_ENTRY0("RxRcvHdrCorErr", RXES(RCV_HDR_COR)),
+/* 4*/ FLAG_ENTRY0("RxRcvDataUncErr", RXES(RCV_DATA_UNC)),
+/* 5*/ FLAG_ENTRY0("RxRcvDataCorErr", RXES(RCV_DATA_COR)),
+/* 6*/ FLAG_ENTRY0("RxRcvQpMapTableUncErr", RXES(RCV_QP_MAP_TABLE_UNC)),
+/* 7*/ FLAG_ENTRY0("RxRcvQpMapTableCorErr", RXES(RCV_QP_MAP_TABLE_COR)),
+/* 8*/ FLAG_ENTRY0("RxRcvCsrParityErr", RXES(RCV_CSR_PARITY)),
+/* 9*/ FLAG_ENTRY0("RxDcSopEopParityErr", RXES(DC_SOP_EOP_PARITY)),
+/*10*/ FLAG_ENTRY0("RxDmaFlagUncErr", RXES(DMA_FLAG_UNC)),
+/*11*/ FLAG_ENTRY0("RxDmaFlagCorErr", RXES(DMA_FLAG_COR)),
+/*12*/ FLAG_ENTRY0("RxRcvFsmEncodingErr", RXES(RCV_FSM_ENCODING)),
+/*13*/ FLAG_ENTRY0("RxRbufFreeListUncErr", RXES(RBUF_FREE_LIST_UNC)),
+/*14*/ FLAG_ENTRY0("RxRbufFreeListCorErr", RXES(RBUF_FREE_LIST_COR)),
+/*15*/ FLAG_ENTRY0("RxRbufLookupDesRegUncErr", RXES(RBUF_LOOKUP_DES_REG_UNC)),
+/*16*/ FLAG_ENTRY0("RxRbufLookupDesRegUncCorErr",
+               RXES(RBUF_LOOKUP_DES_REG_UNC_COR)),
+/*17*/ FLAG_ENTRY0("RxRbufLookupDesUncErr", RXES(RBUF_LOOKUP_DES_UNC)),
+/*18*/ FLAG_ENTRY0("RxRbufLookupDesCorErr", RXES(RBUF_LOOKUP_DES_COR)),
+/*19*/ FLAG_ENTRY0("RxRbufBlockListReadUncErr",
+               RXES(RBUF_BLOCK_LIST_READ_UNC)),
+/*20*/ FLAG_ENTRY0("RxRbufBlockListReadCorErr",
+               RXES(RBUF_BLOCK_LIST_READ_COR)),
+/*21*/ FLAG_ENTRY0("RxRbufCsrQHeadBufNumParityErr",
+               RXES(RBUF_CSR_QHEAD_BUF_NUM_PARITY)),
+/*22*/ FLAG_ENTRY0("RxRbufCsrQEntCntParityErr",
+               RXES(RBUF_CSR_QENT_CNT_PARITY)),
+/*23*/ FLAG_ENTRY0("RxRbufCsrQNextBufParityErr",
+               RXES(RBUF_CSR_QNEXT_BUF_PARITY)),
+/*24*/ FLAG_ENTRY0("RxRbufCsrQVldBitParityErr",
+               RXES(RBUF_CSR_QVLD_BIT_PARITY)),
+/*25*/ FLAG_ENTRY0("RxRbufCsrQHdPtrParityErr", RXES(RBUF_CSR_QHD_PTR_PARITY)),
+/*26*/ FLAG_ENTRY0("RxRbufCsrQTlPtrParityErr", RXES(RBUF_CSR_QTL_PTR_PARITY)),
+/*27*/ FLAG_ENTRY0("RxRbufCsrQNumOfPktParityErr",
+               RXES(RBUF_CSR_QNUM_OF_PKT_PARITY)),
+/*28*/ FLAG_ENTRY0("RxRbufCsrQEOPDWParityErr", RXES(RBUF_CSR_QEOPDW_PARITY)),
+/*29*/ FLAG_ENTRY0("RxRbufCtxIdParityErr", RXES(RBUF_CTX_ID_PARITY)),
+/*30*/ FLAG_ENTRY0("RxRBufBadLookupErr", RXES(RBUF_BAD_LOOKUP)),
+/*31*/ FLAG_ENTRY0("RxRbufFullErr", RXES(RBUF_FULL)),
+/*32*/ FLAG_ENTRY0("RxRbufEmptyErr", RXES(RBUF_EMPTY)),
+/*33*/ FLAG_ENTRY0("RxRbufFlRdAddrParityErr", RXES(RBUF_FL_RD_ADDR_PARITY)),
+/*34*/ FLAG_ENTRY0("RxRbufFlWrAddrParityErr", RXES(RBUF_FL_WR_ADDR_PARITY)),
+/*35*/ FLAG_ENTRY0("RxRbufFlInitdoneParityErr",
+               RXES(RBUF_FL_INITDONE_PARITY)),
+/*36*/ FLAG_ENTRY0("RxRbufFlInitWrAddrParityErr",
+               RXES(RBUF_FL_INIT_WR_ADDR_PARITY)),
+/*37*/ FLAG_ENTRY0("RxRbufNextFreeBufUncErr", RXES(RBUF_NEXT_FREE_BUF_UNC)),
+/*38*/ FLAG_ENTRY0("RxRbufNextFreeBufCorErr", RXES(RBUF_NEXT_FREE_BUF_COR)),
+/*39*/ FLAG_ENTRY0("RxLookupDesPart1UncErr", RXES(LOOKUP_DES_PART1_UNC)),
+/*40*/ FLAG_ENTRY0("RxLookupDesPart1UncCorErr",
+               RXES(LOOKUP_DES_PART1_UNC_COR)),
+/*41*/ FLAG_ENTRY0("RxLookupDesPart2ParityErr",
+               RXES(LOOKUP_DES_PART2_PARITY)),
+/*42*/ FLAG_ENTRY0("RxLookupRcvArrayUncErr", RXES(LOOKUP_RCV_ARRAY_UNC)),
+/*43*/ FLAG_ENTRY0("RxLookupRcvArrayCorErr", RXES(LOOKUP_RCV_ARRAY_COR)),
+/*44*/ FLAG_ENTRY0("RxLookupCsrParityErr", RXES(LOOKUP_CSR_PARITY)),
+/*45*/ FLAG_ENTRY0("RxHqIntrCsrParityErr", RXES(HQ_INTR_CSR_PARITY)),
+/*46*/ FLAG_ENTRY0("RxHqIntrFsmErr", RXES(HQ_INTR_FSM)),
+/*47*/ FLAG_ENTRY0("RxRbufDescPart1UncErr", RXES(RBUF_DESC_PART1_UNC)),
+/*48*/ FLAG_ENTRY0("RxRbufDescPart1CorErr", RXES(RBUF_DESC_PART1_COR)),
+/*49*/ FLAG_ENTRY0("RxRbufDescPart2UncErr", RXES(RBUF_DESC_PART2_UNC)),
+/*50*/ FLAG_ENTRY0("RxRbufDescPart2CorErr", RXES(RBUF_DESC_PART2_COR)),
+/*51*/ FLAG_ENTRY0("RxDmaHdrFifoRdUncErr", RXES(DMA_HDR_FIFO_RD_UNC)),
+/*52*/ FLAG_ENTRY0("RxDmaHdrFifoRdCorErr", RXES(DMA_HDR_FIFO_RD_COR)),
+/*53*/ FLAG_ENTRY0("RxDmaDataFifoRdUncErr", RXES(DMA_DATA_FIFO_RD_UNC)),
+/*54*/ FLAG_ENTRY0("RxDmaDataFifoRdCorErr", RXES(DMA_DATA_FIFO_RD_COR)),
+/*55*/ FLAG_ENTRY0("RxRbufDataUncErr", RXES(RBUF_DATA_UNC)),
+/*56*/ FLAG_ENTRY0("RxRbufDataCorErr", RXES(RBUF_DATA_COR)),
+/*57*/ FLAG_ENTRY0("RxDmaCsrParityErr", RXES(DMA_CSR_PARITY)),
+/*58*/ FLAG_ENTRY0("RxDmaEqFsmEncodingErr", RXES(DMA_EQ_FSM_ENCODING)),
+/*59*/ FLAG_ENTRY0("RxDmaDqFsmEncodingErr", RXES(DMA_DQ_FSM_ENCODING)),
+/*60*/ FLAG_ENTRY0("RxDmaCsrUncErr", RXES(DMA_CSR_UNC)),
+/*61*/ FLAG_ENTRY0("RxCsrReadBadAddrErr", RXES(CSR_READ_BAD_ADDR)),
+/*62*/ FLAG_ENTRY0("RxCsrWriteBadAddrErr", RXES(CSR_WRITE_BAD_ADDR)),
+/*63*/ FLAG_ENTRY0("RxCsrParityErr", RXES(CSR_PARITY))
+};
+
+/* RXE errors that will trigger an SPC freeze */
+#define ALL_RXE_FREEZE_ERR  \
+       (RCV_ERR_STATUS_RX_RCV_QP_MAP_TABLE_UNC_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RCV_CSR_PARITY_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_DMA_FLAG_UNC_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RCV_FSM_ENCODING_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_FREE_LIST_UNC_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_LOOKUP_DES_REG_UNC_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_LOOKUP_DES_REG_UNC_COR_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_LOOKUP_DES_UNC_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_BLOCK_LIST_READ_UNC_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_CSR_QHEAD_BUF_NUM_PARITY_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_CSR_QENT_CNT_PARITY_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_CSR_QNEXT_BUF_PARITY_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_CSR_QVLD_BIT_PARITY_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_CSR_QHD_PTR_PARITY_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_CSR_QTL_PTR_PARITY_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_CSR_QNUM_OF_PKT_PARITY_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_CSR_QEOPDW_PARITY_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_CTX_ID_PARITY_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_BAD_LOOKUP_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_FULL_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_EMPTY_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_FL_RD_ADDR_PARITY_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_FL_WR_ADDR_PARITY_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_FL_INITDONE_PARITY_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_FL_INIT_WR_ADDR_PARITY_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_NEXT_FREE_BUF_UNC_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_LOOKUP_DES_PART1_UNC_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_LOOKUP_DES_PART1_UNC_COR_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_LOOKUP_DES_PART2_PARITY_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_LOOKUP_RCV_ARRAY_UNC_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_LOOKUP_CSR_PARITY_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_HQ_INTR_CSR_PARITY_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_HQ_INTR_FSM_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_DESC_PART1_UNC_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_DESC_PART1_COR_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_DESC_PART2_UNC_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_DMA_HDR_FIFO_RD_UNC_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_DMA_DATA_FIFO_RD_UNC_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_RBUF_DATA_UNC_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_DMA_CSR_PARITY_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_DMA_EQ_FSM_ENCODING_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_DMA_DQ_FSM_ENCODING_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_DMA_CSR_UNC_ERR_SMASK \
+       | RCV_ERR_STATUS_RX_CSR_PARITY_ERR_SMASK)
+
+#define RXE_FREEZE_ABORT_MASK \
+       (RCV_ERR_STATUS_RX_DMA_CSR_UNC_ERR_SMASK | \
+       RCV_ERR_STATUS_RX_DMA_HDR_FIFO_RD_UNC_ERR_SMASK | \
+       RCV_ERR_STATUS_RX_DMA_DATA_FIFO_RD_UNC_ERR_SMASK)
+
+/*
+ * DCC Error Flags
+ */
+#define DCCE(name) DCC_ERR_FLG_##name##_SMASK
+static struct flag_table dcc_err_flags[] = {
+       FLAG_ENTRY0("bad_l2_err", DCCE(BAD_L2_ERR)),
+       FLAG_ENTRY0("bad_sc_err", DCCE(BAD_SC_ERR)),
+       FLAG_ENTRY0("bad_mid_tail_err", DCCE(BAD_MID_TAIL_ERR)),
+       FLAG_ENTRY0("bad_preemption_err", DCCE(BAD_PREEMPTION_ERR)),
+       FLAG_ENTRY0("preemption_err", DCCE(PREEMPTION_ERR)),
+       FLAG_ENTRY0("preemptionvl15_err", DCCE(PREEMPTIONVL15_ERR)),
+       FLAG_ENTRY0("bad_vl_marker_err", DCCE(BAD_VL_MARKER_ERR)),
+       FLAG_ENTRY0("bad_dlid_target_err", DCCE(BAD_DLID_TARGET_ERR)),
+       FLAG_ENTRY0("bad_lver_err", DCCE(BAD_LVER_ERR)),
+       FLAG_ENTRY0("uncorrectable_err", DCCE(UNCORRECTABLE_ERR)),
+       FLAG_ENTRY0("bad_crdt_ack_err", DCCE(BAD_CRDT_ACK_ERR)),
+       FLAG_ENTRY0("unsup_pkt_type", DCCE(UNSUP_PKT_TYPE)),
+       FLAG_ENTRY0("bad_ctrl_flit_err", DCCE(BAD_CTRL_FLIT_ERR)),
+       FLAG_ENTRY0("event_cntr_parity_err", DCCE(EVENT_CNTR_PARITY_ERR)),
+       FLAG_ENTRY0("event_cntr_rollover_err", DCCE(EVENT_CNTR_ROLLOVER_ERR)),
+       FLAG_ENTRY0("link_err", DCCE(LINK_ERR)),
+       FLAG_ENTRY0("misc_cntr_rollover_err", DCCE(MISC_CNTR_ROLLOVER_ERR)),
+       FLAG_ENTRY0("bad_ctrl_dist_err", DCCE(BAD_CTRL_DIST_ERR)),
+       FLAG_ENTRY0("bad_tail_dist_err", DCCE(BAD_TAIL_DIST_ERR)),
+       FLAG_ENTRY0("bad_head_dist_err", DCCE(BAD_HEAD_DIST_ERR)),
+       FLAG_ENTRY0("nonvl15_state_err", DCCE(NONVL15_STATE_ERR)),
+       FLAG_ENTRY0("vl15_multi_err", DCCE(VL15_MULTI_ERR)),
+       FLAG_ENTRY0("bad_pkt_length_err", DCCE(BAD_PKT_LENGTH_ERR)),
+       FLAG_ENTRY0("unsup_vl_err", DCCE(UNSUP_VL_ERR)),
+       FLAG_ENTRY0("perm_nvl15_err", DCCE(PERM_NVL15_ERR)),
+       FLAG_ENTRY0("slid_zero_err", DCCE(SLID_ZERO_ERR)),
+       FLAG_ENTRY0("dlid_zero_err", DCCE(DLID_ZERO_ERR)),
+       FLAG_ENTRY0("length_mtu_err", DCCE(LENGTH_MTU_ERR)),
+       FLAG_ENTRY0("rx_early_drop_err", DCCE(RX_EARLY_DROP_ERR)),
+       FLAG_ENTRY0("late_short_err", DCCE(LATE_SHORT_ERR)),
+       FLAG_ENTRY0("late_long_err", DCCE(LATE_LONG_ERR)),
+       FLAG_ENTRY0("late_ebp_err", DCCE(LATE_EBP_ERR)),
+       FLAG_ENTRY0("fpe_tx_fifo_ovflw_err", DCCE(FPE_TX_FIFO_OVFLW_ERR)),
+       FLAG_ENTRY0("fpe_tx_fifo_unflw_err", DCCE(FPE_TX_FIFO_UNFLW_ERR)),
+       FLAG_ENTRY0("csr_access_blocked_host", DCCE(CSR_ACCESS_BLOCKED_HOST)),
+       FLAG_ENTRY0("csr_access_blocked_uc", DCCE(CSR_ACCESS_BLOCKED_UC)),
+       FLAG_ENTRY0("tx_ctrl_parity_err", DCCE(TX_CTRL_PARITY_ERR)),
+       FLAG_ENTRY0("tx_ctrl_parity_mbe_err", DCCE(TX_CTRL_PARITY_MBE_ERR)),
+       FLAG_ENTRY0("tx_sc_parity_err", DCCE(TX_SC_PARITY_ERR)),
+       FLAG_ENTRY0("rx_ctrl_parity_mbe_err", DCCE(RX_CTRL_PARITY_MBE_ERR)),
+       FLAG_ENTRY0("csr_parity_err", DCCE(CSR_PARITY_ERR)),
+       FLAG_ENTRY0("csr_inval_addr", DCCE(CSR_INVAL_ADDR)),
+       FLAG_ENTRY0("tx_byte_shft_parity_err", DCCE(TX_BYTE_SHFT_PARITY_ERR)),
+       FLAG_ENTRY0("rx_byte_shft_parity_err", DCCE(RX_BYTE_SHFT_PARITY_ERR)),
+       FLAG_ENTRY0("fmconfig_err", DCCE(FMCONFIG_ERR)),
+       FLAG_ENTRY0("rcvport_err", DCCE(RCVPORT_ERR)),
+};
+
+/*
+ * LCB error flags
+ */
+#define LCBE(name) DC_LCB_ERR_FLG_##name##_SMASK
+static struct flag_table lcb_err_flags[] = {
+/* 0*/ FLAG_ENTRY0("CSR_PARITY_ERR", LCBE(CSR_PARITY_ERR)),
+/* 1*/ FLAG_ENTRY0("INVALID_CSR_ADDR", LCBE(INVALID_CSR_ADDR)),
+/* 2*/ FLAG_ENTRY0("RST_FOR_FAILED_DESKEW", LCBE(RST_FOR_FAILED_DESKEW)),
+/* 3*/ FLAG_ENTRY0("ALL_LNS_FAILED_REINIT_TEST",
+               LCBE(ALL_LNS_FAILED_REINIT_TEST)),
+/* 4*/ FLAG_ENTRY0("LOST_REINIT_STALL_OR_TOS", LCBE(LOST_REINIT_STALL_OR_TOS)),
+/* 5*/ FLAG_ENTRY0("TX_LESS_THAN_FOUR_LNS", LCBE(TX_LESS_THAN_FOUR_LNS)),
+/* 6*/ FLAG_ENTRY0("RX_LESS_THAN_FOUR_LNS", LCBE(RX_LESS_THAN_FOUR_LNS)),
+/* 7*/ FLAG_ENTRY0("SEQ_CRC_ERR", LCBE(SEQ_CRC_ERR)),
+/* 8*/ FLAG_ENTRY0("REINIT_FROM_PEER", LCBE(REINIT_FROM_PEER)),
+/* 9*/ FLAG_ENTRY0("REINIT_FOR_LN_DEGRADE", LCBE(REINIT_FOR_LN_DEGRADE)),
+/*10*/ FLAG_ENTRY0("CRC_ERR_CNT_HIT_LIMIT", LCBE(CRC_ERR_CNT_HIT_LIMIT)),
+/*11*/ FLAG_ENTRY0("RCLK_STOPPED", LCBE(RCLK_STOPPED)),
+/*12*/ FLAG_ENTRY0("UNEXPECTED_REPLAY_MARKER", LCBE(UNEXPECTED_REPLAY_MARKER)),
+/*13*/ FLAG_ENTRY0("UNEXPECTED_ROUND_TRIP_MARKER",
+               LCBE(UNEXPECTED_ROUND_TRIP_MARKER)),
+/*14*/ FLAG_ENTRY0("ILLEGAL_NULL_LTP", LCBE(ILLEGAL_NULL_LTP)),
+/*15*/ FLAG_ENTRY0("ILLEGAL_FLIT_ENCODING", LCBE(ILLEGAL_FLIT_ENCODING)),
+/*16*/ FLAG_ENTRY0("FLIT_INPUT_BUF_OFLW", LCBE(FLIT_INPUT_BUF_OFLW)),
+/*17*/ FLAG_ENTRY0("VL_ACK_INPUT_BUF_OFLW", LCBE(VL_ACK_INPUT_BUF_OFLW)),
+/*18*/ FLAG_ENTRY0("VL_ACK_INPUT_PARITY_ERR", LCBE(VL_ACK_INPUT_PARITY_ERR)),
+/*19*/ FLAG_ENTRY0("VL_ACK_INPUT_WRONG_CRC_MODE",
+               LCBE(VL_ACK_INPUT_WRONG_CRC_MODE)),
+/*20*/ FLAG_ENTRY0("FLIT_INPUT_BUF_MBE", LCBE(FLIT_INPUT_BUF_MBE)),
+/*21*/ FLAG_ENTRY0("FLIT_INPUT_BUF_SBE", LCBE(FLIT_INPUT_BUF_SBE)),
+/*22*/ FLAG_ENTRY0("REPLAY_BUF_MBE", LCBE(REPLAY_BUF_MBE)),
+/*23*/ FLAG_ENTRY0("REPLAY_BUF_SBE", LCBE(REPLAY_BUF_SBE)),
+/*24*/ FLAG_ENTRY0("CREDIT_RETURN_FLIT_MBE", LCBE(CREDIT_RETURN_FLIT_MBE)),
+/*25*/ FLAG_ENTRY0("RST_FOR_LINK_TIMEOUT", LCBE(RST_FOR_LINK_TIMEOUT)),
+/*26*/ FLAG_ENTRY0("RST_FOR_INCOMPLT_RND_TRIP",
+               LCBE(RST_FOR_INCOMPLT_RND_TRIP)),
+/*27*/ FLAG_ENTRY0("HOLD_REINIT", LCBE(HOLD_REINIT)),
+/*28*/ FLAG_ENTRY0("NEG_EDGE_LINK_TRANSFER_ACTIVE",
+               LCBE(NEG_EDGE_LINK_TRANSFER_ACTIVE)),
+/*29*/ FLAG_ENTRY0("REDUNDANT_FLIT_PARITY_ERR",
+               LCBE(REDUNDANT_FLIT_PARITY_ERR))
+};
+
+/*
+ * DC8051 Error Flags
+ */
+#define D8E(name) DC_DC8051_ERR_FLG_##name##_SMASK
+static struct flag_table dc8051_err_flags[] = {
+       FLAG_ENTRY0("SET_BY_8051", D8E(SET_BY_8051)),
+       FLAG_ENTRY0("LOST_8051_HEART_BEAT", D8E(LOST_8051_HEART_BEAT)),
+       FLAG_ENTRY0("CRAM_MBE", D8E(CRAM_MBE)),
+       FLAG_ENTRY0("CRAM_SBE", D8E(CRAM_SBE)),
+       FLAG_ENTRY0("DRAM_MBE", D8E(DRAM_MBE)),
+       FLAG_ENTRY0("DRAM_SBE", D8E(DRAM_SBE)),
+       FLAG_ENTRY0("IRAM_MBE", D8E(IRAM_MBE)),
+       FLAG_ENTRY0("IRAM_SBE", D8E(IRAM_SBE)),
+       FLAG_ENTRY0("UNMATCHED_SECURE_MSG_ACROSS_BCC_LANES",
+               D8E(UNMATCHED_SECURE_MSG_ACROSS_BCC_LANES)),
+       FLAG_ENTRY0("INVALID_CSR_ADDR", D8E(INVALID_CSR_ADDR)),
+};
+
+/*
+ * DC8051 Information Error flags
+ *
+ * Flags in DC8051_DBG_ERR_INFO_SET_BY_8051.ERROR field.
+ */
+static struct flag_table dc8051_info_err_flags[] = {
+       FLAG_ENTRY0("Spico ROM check failed",  SPICO_ROM_FAILED),
+       FLAG_ENTRY0("Unknown frame received",  UNKNOWN_FRAME),
+       FLAG_ENTRY0("Target BER not met",      TARGET_BER_NOT_MET),
+       FLAG_ENTRY0("Serdes internal loopback failure",
+                                       FAILED_SERDES_INTERNAL_LOOPBACK),
+       FLAG_ENTRY0("Failed SerDes init",      FAILED_SERDES_INIT),
+       FLAG_ENTRY0("Failed LNI(Polling)",     FAILED_LNI_POLLING),
+       FLAG_ENTRY0("Failed LNI(Debounce)",    FAILED_LNI_DEBOUNCE),
+       FLAG_ENTRY0("Failed LNI(EstbComm)",    FAILED_LNI_ESTBCOMM),
+       FLAG_ENTRY0("Failed LNI(OptEq)",       FAILED_LNI_OPTEQ),
+       FLAG_ENTRY0("Failed LNI(VerifyCap_1)", FAILED_LNI_VERIFY_CAP1),
+       FLAG_ENTRY0("Failed LNI(VerifyCap_2)", FAILED_LNI_VERIFY_CAP2),
+       FLAG_ENTRY0("Failed LNI(ConfigLT)",    FAILED_LNI_CONFIGLT)
+};
+
+/*
+ * DC8051 Information Host Information flags
+ *
+ * Flags in DC8051_DBG_ERR_INFO_SET_BY_8051.HOST_MSG field.
+ */
+static struct flag_table dc8051_info_host_msg_flags[] = {
+       FLAG_ENTRY0("Host request done", 0x0001),
+       FLAG_ENTRY0("BC SMA message", 0x0002),
+       FLAG_ENTRY0("BC PWR_MGM message", 0x0004),
+       FLAG_ENTRY0("BC Unknown message (BCC)", 0x0008),
+       FLAG_ENTRY0("BC Unknown message (LCB)", 0x0010),
+       FLAG_ENTRY0("External device config request", 0x0020),
+       FLAG_ENTRY0("VerifyCap all frames received", 0x0040),
+       FLAG_ENTRY0("LinkUp achieved", 0x0080),
+       FLAG_ENTRY0("Link going down", 0x0100),
+};
+
+
+static u32 encoded_size(u32 size);
+static u32 chip_to_opa_lstate(struct hfi1_devdata *dd, u32 chip_lstate);
+static int set_physical_link_state(struct hfi1_devdata *dd, u64 state);
+static void read_vc_remote_phy(struct hfi1_devdata *dd, u8 *power_management,
+                              u8 *continuous);
+static void read_vc_remote_fabric(struct hfi1_devdata *dd, u8 *vau, u8 *z,
+                                 u8 *vcu, u16 *vl15buf, u8 *crc_sizes);
+static void read_vc_remote_link_width(struct hfi1_devdata *dd,
+                                     u8 *remote_tx_rate, u16 *link_widths);
+static void read_vc_local_link_width(struct hfi1_devdata *dd, u8 *misc_bits,
+                                    u8 *flag_bits, u16 *link_widths);
+static void read_remote_device_id(struct hfi1_devdata *dd, u16 *device_id,
+                                 u8 *device_rev);
+static void read_mgmt_allowed(struct hfi1_devdata *dd, u8 *mgmt_allowed);
+static void read_local_lni(struct hfi1_devdata *dd, u8 *enable_lane_rx);
+static int read_tx_settings(struct hfi1_devdata *dd, u8 *enable_lane_tx,
+                           u8 *tx_polarity_inversion,
+                           u8 *rx_polarity_inversion, u8 *max_rate);
+static void handle_sdma_eng_err(struct hfi1_devdata *dd,
+                               unsigned int context, u64 err_status);
+static void handle_qsfp_int(struct hfi1_devdata *dd, u32 source, u64 reg);
+static void handle_dcc_err(struct hfi1_devdata *dd,
+                          unsigned int context, u64 err_status);
+static void handle_lcb_err(struct hfi1_devdata *dd,
+                          unsigned int context, u64 err_status);
+static void handle_8051_interrupt(struct hfi1_devdata *dd, u32 unused, u64 reg);
+static void handle_cce_err(struct hfi1_devdata *dd, u32 unused, u64 reg);
+static void handle_rxe_err(struct hfi1_devdata *dd, u32 unused, u64 reg);
+static void handle_misc_err(struct hfi1_devdata *dd, u32 unused, u64 reg);
+static void handle_pio_err(struct hfi1_devdata *dd, u32 unused, u64 reg);
+static void handle_sdma_err(struct hfi1_devdata *dd, u32 unused, u64 reg);
+static void handle_egress_err(struct hfi1_devdata *dd, u32 unused, u64 reg);
+static void handle_txe_err(struct hfi1_devdata *dd, u32 unused, u64 reg);
+static void set_partition_keys(struct hfi1_pportdata *);
+static const char *link_state_name(u32 state);
+static const char *link_state_reason_name(struct hfi1_pportdata *ppd,
+                                         u32 state);
+static int do_8051_command(struct hfi1_devdata *dd, u32 type, u64 in_data,
+                          u64 *out_data);
+static int read_idle_sma(struct hfi1_devdata *dd, u64 *data);
+static int thermal_init(struct hfi1_devdata *dd);
+
+static int wait_logical_linkstate(struct hfi1_pportdata *ppd, u32 state,
+                                 int msecs);
+static void read_planned_down_reason_code(struct hfi1_devdata *dd, u8 *pdrrc);
+static void handle_temp_err(struct hfi1_devdata *);
+static void dc_shutdown(struct hfi1_devdata *);
+static void dc_start(struct hfi1_devdata *);
+
+/*
+ * Error interrupt table entry.  This is used as input to the interrupt
+ * "clear down" routine used for all second tier error interrupt register.
+ * Second tier interrupt registers have a single bit representing them
+ * in the top-level CceIntStatus.
+ */
+struct err_reg_info {
+       u32 status;             /* status CSR offset */
+       u32 clear;              /* clear CSR offset */
+       u32 mask;               /* mask CSR offset */
+       void (*handler)(struct hfi1_devdata *dd, u32 source, u64 reg);
+       const char *desc;
+};
+
+#define NUM_MISC_ERRS (IS_GENERAL_ERR_END - IS_GENERAL_ERR_START)
+#define NUM_DC_ERRS (IS_DC_END - IS_DC_START)
+#define NUM_VARIOUS (IS_VARIOUS_END - IS_VARIOUS_START)
+
+/*
+ * Helpers for building HFI and DC error interrupt table entries.  Different
+ * helpers are needed because of inconsistent register names.
+ */
+#define EE(reg, handler, desc) \
+       { reg##_STATUS, reg##_CLEAR, reg##_MASK, \
+               handler, desc }
+#define DC_EE1(reg, handler, desc) \
+       { reg##_FLG, reg##_FLG_CLR, reg##_FLG_EN, handler, desc }
+#define DC_EE2(reg, handler, desc) \
+       { reg##_FLG, reg##_CLR, reg##_EN, handler, desc }
+
+/*
+ * Table of the "misc" grouping of error interrupts.  Each entry refers to
+ * another register containing more information.
+ */
+static const struct err_reg_info misc_errs[NUM_MISC_ERRS] = {
+/* 0*/ EE(CCE_ERR,             handle_cce_err,    "CceErr"),
+/* 1*/ EE(RCV_ERR,             handle_rxe_err,    "RxeErr"),
+/* 2*/ EE(MISC_ERR,    handle_misc_err,   "MiscErr"),
+/* 3*/ { 0, 0, 0, NULL }, /* reserved */
+/* 4*/ EE(SEND_PIO_ERR,    handle_pio_err,    "PioErr"),
+/* 5*/ EE(SEND_DMA_ERR,    handle_sdma_err,   "SDmaErr"),
+/* 6*/ EE(SEND_EGRESS_ERR, handle_egress_err, "EgressErr"),
+/* 7*/ EE(SEND_ERR,    handle_txe_err,    "TxeErr")
+       /* the rest are reserved */
+};
+
+/*
+ * Index into the Various section of the interrupt sources
+ * corresponding to the Critical Temperature interrupt.
+ */
+#define TCRIT_INT_SOURCE 4
+
+/*
+ * SDMA error interrupt entry - refers to another register containing more
+ * information.
+ */
+static const struct err_reg_info sdma_eng_err =
+       EE(SEND_DMA_ENG_ERR, handle_sdma_eng_err, "SDmaEngErr");
+
+static const struct err_reg_info various_err[NUM_VARIOUS] = {
+/* 0*/ { 0, 0, 0, NULL }, /* PbcInt */
+/* 1*/ { 0, 0, 0, NULL }, /* GpioAssertInt */
+/* 2*/ EE(ASIC_QSFP1,  handle_qsfp_int,        "QSFP1"),
+/* 3*/ EE(ASIC_QSFP2,  handle_qsfp_int,        "QSFP2"),
+/* 4*/ { 0, 0, 0, NULL }, /* TCritInt */
+       /* rest are reserved */
+};
+
+/*
+ * The DC encoding of mtu_cap for 10K MTU in the DCC_CFG_PORT_CONFIG
+ * register can not be derived from the MTU value because 10K is not
+ * a power of 2. Therefore, we need a constant. Everything else can
+ * be calculated.
+ */
+#define DCC_CFG_PORT_MTU_CAP_10240 7
+
+/*
+ * Table of the DC grouping of error interrupts.  Each entry refers to
+ * another register containing more information.
+ */
+static const struct err_reg_info dc_errs[NUM_DC_ERRS] = {
+/* 0*/ DC_EE1(DCC_ERR,         handle_dcc_err,        "DCC Err"),
+/* 1*/ DC_EE2(DC_LCB_ERR,      handle_lcb_err,        "LCB Err"),
+/* 2*/ DC_EE2(DC_DC8051_ERR,   handle_8051_interrupt, "DC8051 Interrupt"),
+/* 3*/ /* dc_lbm_int - special, see is_dc_int() */
+       /* the rest are reserved */
+};
+
+struct cntr_entry {
+       /*
+        * counter name
+        */
+       char *name;
+
+       /*
+        * csr to read for name (if applicable)
+        */
+       u64 csr;
+
+       /*
+        * offset into dd or ppd to store the counter's value
+        */
+       int offset;
+
+       /*
+        * flags
+        */
+       u8 flags;
+
+       /*
+        * accessor for stat element, context either dd or ppd
+        */
+       u64 (*rw_cntr)(const struct cntr_entry *,
+                              void *context,
+                              int vl,
+                              int mode,
+                              u64 data);
+};
+
+#define C_RCV_HDR_OVF_FIRST C_RCV_HDR_OVF_0
+#define C_RCV_HDR_OVF_LAST C_RCV_HDR_OVF_159
+
+#define CNTR_ELEM(name, csr, offset, flags, accessor) \
+{ \
+       name, \
+       csr, \
+       offset, \
+       flags, \
+       accessor \
+}
+
+/* 32bit RXE */
+#define RXE32_PORT_CNTR_ELEM(name, counter, flags) \
+CNTR_ELEM(#name, \
+         (counter * 8 + RCV_COUNTER_ARRAY32), \
+         0, flags | CNTR_32BIT, \
+         port_access_u32_csr)
+
+#define RXE32_DEV_CNTR_ELEM(name, counter, flags) \
+CNTR_ELEM(#name, \
+         (counter * 8 + RCV_COUNTER_ARRAY32), \
+         0, flags | CNTR_32BIT, \
+         dev_access_u32_csr)
+
+/* 64bit RXE */
+#define RXE64_PORT_CNTR_ELEM(name, counter, flags) \
+CNTR_ELEM(#name, \
+         (counter * 8 + RCV_COUNTER_ARRAY64), \
+         0, flags, \
+         port_access_u64_csr)
+
+#define RXE64_DEV_CNTR_ELEM(name, counter, flags) \
+CNTR_ELEM(#name, \
+         (counter * 8 + RCV_COUNTER_ARRAY64), \
+         0, flags, \
+         dev_access_u64_csr)
+
+#define OVR_LBL(ctx) C_RCV_HDR_OVF_ ## ctx
+#define OVR_ELM(ctx) \
+CNTR_ELEM("RcvHdrOvr" #ctx, \
+         (RCV_HDR_OVFL_CNT + ctx*0x100), \
+         0, CNTR_NORMAL, port_access_u64_csr)
+
+/* 32bit TXE */
+#define TXE32_PORT_CNTR_ELEM(name, counter, flags) \
+CNTR_ELEM(#name, \
+         (counter * 8 + SEND_COUNTER_ARRAY32), \
+         0, flags | CNTR_32BIT, \
+         port_access_u32_csr)
+
+/* 64bit TXE */
+#define TXE64_PORT_CNTR_ELEM(name, counter, flags) \
+CNTR_ELEM(#name, \
+         (counter * 8 + SEND_COUNTER_ARRAY64), \
+         0, flags, \
+         port_access_u64_csr)
+
+# define TX64_DEV_CNTR_ELEM(name, counter, flags) \
+CNTR_ELEM(#name,\
+         counter * 8 + SEND_COUNTER_ARRAY64, \
+         0, \
+         flags, \
+         dev_access_u64_csr)
+
+/* CCE */
+#define CCE_PERF_DEV_CNTR_ELEM(name, counter, flags) \
+CNTR_ELEM(#name, \
+         (counter * 8 + CCE_COUNTER_ARRAY32), \
+         0, flags | CNTR_32BIT, \
+         dev_access_u32_csr)
+
+#define CCE_INT_DEV_CNTR_ELEM(name, counter, flags) \
+CNTR_ELEM(#name, \
+         (counter * 8 + CCE_INT_COUNTER_ARRAY32), \
+         0, flags | CNTR_32BIT, \
+         dev_access_u32_csr)
+
+/* DC */
+#define DC_PERF_CNTR(name, counter, flags) \
+CNTR_ELEM(#name, \
+         counter, \
+         0, \
+         flags, \
+         dev_access_u64_csr)
+
+#define DC_PERF_CNTR_LCB(name, counter, flags) \
+CNTR_ELEM(#name, \
+         counter, \
+         0, \
+         flags, \
+         dc_access_lcb_cntr)
+
+/* ibp counters */
+#define SW_IBP_CNTR(name, cntr) \
+CNTR_ELEM(#name, \
+         0, \
+         0, \
+         CNTR_SYNTH, \
+         access_ibp_##cntr)
+
+u64 read_csr(const struct hfi1_devdata *dd, u32 offset)
+{
+       u64 val;
+
+       if (dd->flags & HFI1_PRESENT) {
+               val = readq((void __iomem *)dd->kregbase + offset);
+               return val;
+       }
+       return -1;
+}
+
+void write_csr(const struct hfi1_devdata *dd, u32 offset, u64 value)
+{
+       if (dd->flags & HFI1_PRESENT)
+               writeq(value, (void __iomem *)dd->kregbase + offset);
+}
+
+void __iomem *get_csr_addr(
+       struct hfi1_devdata *dd,
+       u32 offset)
+{
+       return (void __iomem *)dd->kregbase + offset;
+}
+
+static inline u64 read_write_csr(const struct hfi1_devdata *dd, u32 csr,
+                                int mode, u64 value)
+{
+       u64 ret;
+
+
+       if (mode == CNTR_MODE_R) {
+               ret = read_csr(dd, csr);
+       } else if (mode == CNTR_MODE_W) {
+               write_csr(dd, csr, value);
+               ret = value;
+       } else {
+               dd_dev_err(dd, "Invalid cntr register access mode");
+               return 0;
+       }
+
+       hfi1_cdbg(CNTR, "csr 0x%x val 0x%llx mode %d", csr, ret, mode);
+       return ret;
+}
+
+/* Dev Access */
+static u64 dev_access_u32_csr(const struct cntr_entry *entry,
+                           void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = context;
+
+       if (vl != CNTR_INVALID_VL)
+               return 0;
+       return read_write_csr(dd, entry->csr, mode, data);
+}
+
+static u64 dev_access_u64_csr(const struct cntr_entry *entry, void *context,
+                           int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = context;
+
+       u64 val = 0;
+       u64 csr = entry->csr;
+
+       if (entry->flags & CNTR_VL) {
+               if (vl == CNTR_INVALID_VL)
+                       return 0;
+               csr += 8 * vl;
+       } else {
+               if (vl != CNTR_INVALID_VL)
+                       return 0;
+       }
+
+       val = read_write_csr(dd, csr, mode, data);
+       return val;
+}
+
+static u64 dc_access_lcb_cntr(const struct cntr_entry *entry, void *context,
+                           int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = context;
+       u32 csr = entry->csr;
+       int ret = 0;
+
+       if (vl != CNTR_INVALID_VL)
+               return 0;
+       if (mode == CNTR_MODE_R)
+               ret = read_lcb_csr(dd, csr, &data);
+       else if (mode == CNTR_MODE_W)
+               ret = write_lcb_csr(dd, csr, data);
+
+       if (ret) {
+               dd_dev_err(dd, "Could not acquire LCB for counter 0x%x", csr);
+               return 0;
+       }
+
+       hfi1_cdbg(CNTR, "csr 0x%x val 0x%llx mode %d", csr, data, mode);
+       return data;
+}
+
+/* Port Access */
+static u64 port_access_u32_csr(const struct cntr_entry *entry, void *context,
+                            int vl, int mode, u64 data)
+{
+       struct hfi1_pportdata *ppd = context;
+
+       if (vl != CNTR_INVALID_VL)
+               return 0;
+       return read_write_csr(ppd->dd, entry->csr, mode, data);
+}
+
+static u64 port_access_u64_csr(const struct cntr_entry *entry,
+                            void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_pportdata *ppd = context;
+       u64 val;
+       u64 csr = entry->csr;
+
+       if (entry->flags & CNTR_VL) {
+               if (vl == CNTR_INVALID_VL)
+                       return 0;
+               csr += 8 * vl;
+       } else {
+               if (vl != CNTR_INVALID_VL)
+                       return 0;
+       }
+       val = read_write_csr(ppd->dd, csr, mode, data);
+       return val;
+}
+
+/* Software defined */
+static inline u64 read_write_sw(struct hfi1_devdata *dd, u64 *cntr, int mode,
+                               u64 data)
+{
+       u64 ret;
+
+       if (mode == CNTR_MODE_R) {
+               ret = *cntr;
+       } else if (mode == CNTR_MODE_W) {
+               *cntr = data;
+               ret = data;
+       } else {
+               dd_dev_err(dd, "Invalid cntr sw access mode");
+               return 0;
+       }
+
+       hfi1_cdbg(CNTR, "val 0x%llx mode %d", ret, mode);
+
+       return ret;
+}
+
+static u64 access_sw_link_dn_cnt(const struct cntr_entry *entry, void *context,
+                              int vl, int mode, u64 data)
+{
+       struct hfi1_pportdata *ppd = context;
+
+       if (vl != CNTR_INVALID_VL)
+               return 0;
+       return read_write_sw(ppd->dd, &ppd->link_downed, mode, data);
+}
+
+static u64 access_sw_link_up_cnt(const struct cntr_entry *entry, void *context,
+                              int vl, int mode, u64 data)
+{
+       struct hfi1_pportdata *ppd = context;
+
+       if (vl != CNTR_INVALID_VL)
+               return 0;
+       return read_write_sw(ppd->dd, &ppd->link_up, mode, data);
+}
+
+static u64 access_sw_xmit_discards(const struct cntr_entry *entry,
+                                   void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_pportdata *ppd = context;
+
+       if (vl != CNTR_INVALID_VL)
+               return 0;
+
+       return read_write_sw(ppd->dd, &ppd->port_xmit_discards, mode, data);
+}
+
+static u64 access_xmit_constraint_errs(const struct cntr_entry *entry,
+                                    void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_pportdata *ppd = context;
+
+       if (vl != CNTR_INVALID_VL)
+               return 0;
+
+       return read_write_sw(ppd->dd, &ppd->port_xmit_constraint_errors,
+                            mode, data);
+}
+
+static u64 access_rcv_constraint_errs(const struct cntr_entry *entry,
+                                    void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_pportdata *ppd = context;
+
+       if (vl != CNTR_INVALID_VL)
+               return 0;
+
+       return read_write_sw(ppd->dd, &ppd->port_rcv_constraint_errors,
+                            mode, data);
+}
+
+u64 get_all_cpu_total(u64 __percpu *cntr)
+{
+       int cpu;
+       u64 counter = 0;
+
+       for_each_possible_cpu(cpu)
+               counter += *per_cpu_ptr(cntr, cpu);
+       return counter;
+}
+
+static u64 read_write_cpu(struct hfi1_devdata *dd, u64 *z_val,
+                         u64 __percpu *cntr,
+                         int vl, int mode, u64 data)
+{
+
+       u64 ret = 0;
+
+       if (vl != CNTR_INVALID_VL)
+               return 0;
+
+       if (mode == CNTR_MODE_R) {
+               ret = get_all_cpu_total(cntr) - *z_val;
+       } else if (mode == CNTR_MODE_W) {
+               /* A write can only zero the counter */
+               if (data == 0)
+                       *z_val = get_all_cpu_total(cntr);
+               else
+                       dd_dev_err(dd, "Per CPU cntrs can only be zeroed");
+       } else {
+               dd_dev_err(dd, "Invalid cntr sw cpu access mode");
+               return 0;
+       }
+
+       return ret;
+}
+
+static u64 access_sw_cpu_intr(const struct cntr_entry *entry,
+                             void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = context;
+
+       return read_write_cpu(dd, &dd->z_int_counter, dd->int_counter, vl,
+                             mode, data);
+}
+
+static u64 access_sw_cpu_rcv_limit(const struct cntr_entry *entry,
+                             void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = context;
+
+       return read_write_cpu(dd, &dd->z_rcv_limit, dd->rcv_limit, vl,
+                             mode, data);
+}
+
+static u64 access_sw_pio_wait(const struct cntr_entry *entry,
+                             void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = context;
+
+       return dd->verbs_dev.n_piowait;
+}
+
+static u64 access_sw_vtx_wait(const struct cntr_entry *entry,
+                             void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = context;
+
+       return dd->verbs_dev.n_txwait;
+}
+
+static u64 access_sw_kmem_wait(const struct cntr_entry *entry,
+                              void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = context;
+
+       return dd->verbs_dev.n_kmem_wait;
+}
+
+static u64 access_sw_send_schedule(const struct cntr_entry *entry,
+                              void *context, int vl, int mode, u64 data)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)context;
+
+       return dd->verbs_dev.n_send_schedule;
+}
+
+#define def_access_sw_cpu(cntr) \
+static u64 access_sw_cpu_##cntr(const struct cntr_entry *entry,                      \
+                             void *context, int vl, int mode, u64 data)      \
+{                                                                            \
+       struct hfi1_pportdata *ppd = (struct hfi1_pportdata *)context;        \
+       return read_write_cpu(ppd->dd, &ppd->ibport_data.z_ ##cntr,           \
+                             ppd->ibport_data.cntr, vl,                      \
+                             mode, data);                                    \
+}
+
+def_access_sw_cpu(rc_acks);
+def_access_sw_cpu(rc_qacks);
+def_access_sw_cpu(rc_delayed_comp);
+
+#define def_access_ibp_counter(cntr) \
+static u64 access_ibp_##cntr(const struct cntr_entry *entry,                 \
+                               void *context, int vl, int mode, u64 data)    \
+{                                                                            \
+       struct hfi1_pportdata *ppd = (struct hfi1_pportdata *)context;        \
+                                                                             \
+       if (vl != CNTR_INVALID_VL)                                            \
+               return 0;                                                     \
+                                                                             \
+       return read_write_sw(ppd->dd, &ppd->ibport_data.n_ ##cntr,            \
+                            mode, data);                                     \
+}
+
+def_access_ibp_counter(loop_pkts);
+def_access_ibp_counter(rc_resends);
+def_access_ibp_counter(rnr_naks);
+def_access_ibp_counter(other_naks);
+def_access_ibp_counter(rc_timeouts);
+def_access_ibp_counter(pkt_drops);
+def_access_ibp_counter(dmawait);
+def_access_ibp_counter(rc_seqnak);
+def_access_ibp_counter(rc_dupreq);
+def_access_ibp_counter(rdma_seq);
+def_access_ibp_counter(unaligned);
+def_access_ibp_counter(seq_naks);
+
+static struct cntr_entry dev_cntrs[DEV_CNTR_LAST] = {
+[C_RCV_OVF] = RXE32_DEV_CNTR_ELEM(RcvOverflow, RCV_BUF_OVFL_CNT, CNTR_SYNTH),
+[C_RX_TID_FULL] = RXE32_DEV_CNTR_ELEM(RxTIDFullEr, RCV_TID_FULL_ERR_CNT,
+                       CNTR_NORMAL),
+[C_RX_TID_INVALID] = RXE32_DEV_CNTR_ELEM(RxTIDInvalid, RCV_TID_VALID_ERR_CNT,
+                       CNTR_NORMAL),
+[C_RX_TID_FLGMS] = RXE32_DEV_CNTR_ELEM(RxTidFLGMs,
+                       RCV_TID_FLOW_GEN_MISMATCH_CNT,
+                       CNTR_NORMAL),
+[C_RX_CTX_RHQS] = RXE32_DEV_CNTR_ELEM(RxCtxRHQS, RCV_CONTEXT_RHQ_STALL,
+                       CNTR_NORMAL),
+[C_RX_CTX_EGRS] = RXE32_DEV_CNTR_ELEM(RxCtxEgrS, RCV_CONTEXT_EGR_STALL,
+                       CNTR_NORMAL),
+[C_RCV_TID_FLSMS] = RXE32_DEV_CNTR_ELEM(RxTidFLSMs,
+                       RCV_TID_FLOW_SEQ_MISMATCH_CNT, CNTR_NORMAL),
+[C_CCE_PCI_CR_ST] = CCE_PERF_DEV_CNTR_ELEM(CcePciCrSt,
+                       CCE_PCIE_POSTED_CRDT_STALL_CNT, CNTR_NORMAL),
+[C_CCE_PCI_TR_ST] = CCE_PERF_DEV_CNTR_ELEM(CcePciTrSt, CCE_PCIE_TRGT_STALL_CNT,
+                       CNTR_NORMAL),
+[C_CCE_PIO_WR_ST] = CCE_PERF_DEV_CNTR_ELEM(CcePioWrSt, CCE_PIO_WR_STALL_CNT,
+                       CNTR_NORMAL),
+[C_CCE_ERR_INT] = CCE_INT_DEV_CNTR_ELEM(CceErrInt, CCE_ERR_INT_CNT,
+                       CNTR_NORMAL),
+[C_CCE_SDMA_INT] = CCE_INT_DEV_CNTR_ELEM(CceSdmaInt, CCE_SDMA_INT_CNT,
+                       CNTR_NORMAL),
+[C_CCE_MISC_INT] = CCE_INT_DEV_CNTR_ELEM(CceMiscInt, CCE_MISC_INT_CNT,
+                       CNTR_NORMAL),
+[C_CCE_RCV_AV_INT] = CCE_INT_DEV_CNTR_ELEM(CceRcvAvInt, CCE_RCV_AVAIL_INT_CNT,
+                       CNTR_NORMAL),
+[C_CCE_RCV_URG_INT] = CCE_INT_DEV_CNTR_ELEM(CceRcvUrgInt,
+                       CCE_RCV_URGENT_INT_CNT, CNTR_NORMAL),
+[C_CCE_SEND_CR_INT] = CCE_INT_DEV_CNTR_ELEM(CceSndCrInt,
+                       CCE_SEND_CREDIT_INT_CNT, CNTR_NORMAL),
+[C_DC_UNC_ERR] = DC_PERF_CNTR(DcUnctblErr, DCC_ERR_UNCORRECTABLE_CNT,
+                             CNTR_SYNTH),
+[C_DC_RCV_ERR] = DC_PERF_CNTR(DcRecvErr, DCC_ERR_PORTRCV_ERR_CNT, CNTR_SYNTH),
+[C_DC_FM_CFG_ERR] = DC_PERF_CNTR(DcFmCfgErr, DCC_ERR_FMCONFIG_ERR_CNT,
+                                CNTR_SYNTH),
+[C_DC_RMT_PHY_ERR] = DC_PERF_CNTR(DcRmtPhyErr, DCC_ERR_RCVREMOTE_PHY_ERR_CNT,
+                                 CNTR_SYNTH),
+[C_DC_DROPPED_PKT] = DC_PERF_CNTR(DcDroppedPkt, DCC_ERR_DROPPED_PKT_CNT,
+                                 CNTR_SYNTH),
+[C_DC_MC_XMIT_PKTS] = DC_PERF_CNTR(DcMcXmitPkts,
+                                  DCC_PRF_PORT_XMIT_MULTICAST_CNT, CNTR_SYNTH),
+[C_DC_MC_RCV_PKTS] = DC_PERF_CNTR(DcMcRcvPkts,
+                                 DCC_PRF_PORT_RCV_MULTICAST_PKT_CNT,
+                                 CNTR_SYNTH),
+[C_DC_XMIT_CERR] = DC_PERF_CNTR(DcXmitCorr,
+                               DCC_PRF_PORT_XMIT_CORRECTABLE_CNT, CNTR_SYNTH),
+[C_DC_RCV_CERR] = DC_PERF_CNTR(DcRcvCorrCnt, DCC_PRF_PORT_RCV_CORRECTABLE_CNT,
+                              CNTR_SYNTH),
+[C_DC_RCV_FCC] = DC_PERF_CNTR(DcRxFCntl, DCC_PRF_RX_FLOW_CRTL_CNT,
+                             CNTR_SYNTH),
+[C_DC_XMIT_FCC] = DC_PERF_CNTR(DcXmitFCntl, DCC_PRF_TX_FLOW_CRTL_CNT,
+                              CNTR_SYNTH),
+[C_DC_XMIT_FLITS] = DC_PERF_CNTR(DcXmitFlits, DCC_PRF_PORT_XMIT_DATA_CNT,
+                                CNTR_SYNTH),
+[C_DC_RCV_FLITS] = DC_PERF_CNTR(DcRcvFlits, DCC_PRF_PORT_RCV_DATA_CNT,
+                               CNTR_SYNTH),
+[C_DC_XMIT_PKTS] = DC_PERF_CNTR(DcXmitPkts, DCC_PRF_PORT_XMIT_PKTS_CNT,
+                               CNTR_SYNTH),
+[C_DC_RCV_PKTS] = DC_PERF_CNTR(DcRcvPkts, DCC_PRF_PORT_RCV_PKTS_CNT,
+                              CNTR_SYNTH),
+[C_DC_RX_FLIT_VL] = DC_PERF_CNTR(DcRxFlitVl, DCC_PRF_PORT_VL_RCV_DATA_CNT,
+                                CNTR_SYNTH | CNTR_VL),
+[C_DC_RX_PKT_VL] = DC_PERF_CNTR(DcRxPktVl, DCC_PRF_PORT_VL_RCV_PKTS_CNT,
+                               CNTR_SYNTH | CNTR_VL),
+[C_DC_RCV_FCN] = DC_PERF_CNTR(DcRcvFcn, DCC_PRF_PORT_RCV_FECN_CNT, CNTR_SYNTH),
+[C_DC_RCV_FCN_VL] = DC_PERF_CNTR(DcRcvFcnVl, DCC_PRF_PORT_VL_RCV_FECN_CNT,
+                                CNTR_SYNTH | CNTR_VL),
+[C_DC_RCV_BCN] = DC_PERF_CNTR(DcRcvBcn, DCC_PRF_PORT_RCV_BECN_CNT, CNTR_SYNTH),
+[C_DC_RCV_BCN_VL] = DC_PERF_CNTR(DcRcvBcnVl, DCC_PRF_PORT_VL_RCV_BECN_CNT,
+                                CNTR_SYNTH | CNTR_VL),
+[C_DC_RCV_BBL] = DC_PERF_CNTR(DcRcvBbl, DCC_PRF_PORT_RCV_BUBBLE_CNT,
+                             CNTR_SYNTH),
+[C_DC_RCV_BBL_VL] = DC_PERF_CNTR(DcRcvBblVl, DCC_PRF_PORT_VL_RCV_BUBBLE_CNT,
+                                CNTR_SYNTH | CNTR_VL),
+[C_DC_MARK_FECN] = DC_PERF_CNTR(DcMarkFcn, DCC_PRF_PORT_MARK_FECN_CNT,
+                               CNTR_SYNTH),
+[C_DC_MARK_FECN_VL] = DC_PERF_CNTR(DcMarkFcnVl, DCC_PRF_PORT_VL_MARK_FECN_CNT,
+                                  CNTR_SYNTH | CNTR_VL),
+[C_DC_TOTAL_CRC] =
+       DC_PERF_CNTR_LCB(DcTotCrc, DC_LCB_ERR_INFO_TOTAL_CRC_ERR,
+                        CNTR_SYNTH),
+[C_DC_CRC_LN0] = DC_PERF_CNTR_LCB(DcCrcLn0, DC_LCB_ERR_INFO_CRC_ERR_LN0,
+                                 CNTR_SYNTH),
+[C_DC_CRC_LN1] = DC_PERF_CNTR_LCB(DcCrcLn1, DC_LCB_ERR_INFO_CRC_ERR_LN1,
+                                 CNTR_SYNTH),
+[C_DC_CRC_LN2] = DC_PERF_CNTR_LCB(DcCrcLn2, DC_LCB_ERR_INFO_CRC_ERR_LN2,
+                                 CNTR_SYNTH),
+[C_DC_CRC_LN3] = DC_PERF_CNTR_LCB(DcCrcLn3, DC_LCB_ERR_INFO_CRC_ERR_LN3,
+                                 CNTR_SYNTH),
+[C_DC_CRC_MULT_LN] =
+       DC_PERF_CNTR_LCB(DcMultLn, DC_LCB_ERR_INFO_CRC_ERR_MULTI_LN,
+                        CNTR_SYNTH),
+[C_DC_TX_REPLAY] = DC_PERF_CNTR_LCB(DcTxReplay, DC_LCB_ERR_INFO_TX_REPLAY_CNT,
+                                   CNTR_SYNTH),
+[C_DC_RX_REPLAY] = DC_PERF_CNTR_LCB(DcRxReplay, DC_LCB_ERR_INFO_RX_REPLAY_CNT,
+                                   CNTR_SYNTH),
+[C_DC_SEQ_CRC_CNT] =
+       DC_PERF_CNTR_LCB(DcLinkSeqCrc, DC_LCB_ERR_INFO_SEQ_CRC_CNT,
+                        CNTR_SYNTH),
+[C_DC_ESC0_ONLY_CNT] =
+       DC_PERF_CNTR_LCB(DcEsc0, DC_LCB_ERR_INFO_ESCAPE_0_ONLY_CNT,
+                        CNTR_SYNTH),
+[C_DC_ESC0_PLUS1_CNT] =
+       DC_PERF_CNTR_LCB(DcEsc1, DC_LCB_ERR_INFO_ESCAPE_0_PLUS1_CNT,
+                        CNTR_SYNTH),
+[C_DC_ESC0_PLUS2_CNT] =
+       DC_PERF_CNTR_LCB(DcEsc0Plus2, DC_LCB_ERR_INFO_ESCAPE_0_PLUS2_CNT,
+                        CNTR_SYNTH),
+[C_DC_REINIT_FROM_PEER_CNT] =
+       DC_PERF_CNTR_LCB(DcReinitPeer, DC_LCB_ERR_INFO_REINIT_FROM_PEER_CNT,
+                        CNTR_SYNTH),
+[C_DC_SBE_CNT] = DC_PERF_CNTR_LCB(DcSbe, DC_LCB_ERR_INFO_SBE_CNT,
+                                 CNTR_SYNTH),
+[C_DC_MISC_FLG_CNT] =
+       DC_PERF_CNTR_LCB(DcMiscFlg, DC_LCB_ERR_INFO_MISC_FLG_CNT,
+                        CNTR_SYNTH),
+[C_DC_PRF_GOOD_LTP_CNT] =
+       DC_PERF_CNTR_LCB(DcGoodLTP, DC_LCB_PRF_GOOD_LTP_CNT, CNTR_SYNTH),
+[C_DC_PRF_ACCEPTED_LTP_CNT] =
+       DC_PERF_CNTR_LCB(DcAccLTP, DC_LCB_PRF_ACCEPTED_LTP_CNT,
+                        CNTR_SYNTH),
+[C_DC_PRF_RX_FLIT_CNT] =
+       DC_PERF_CNTR_LCB(DcPrfRxFlit, DC_LCB_PRF_RX_FLIT_CNT, CNTR_SYNTH),
+[C_DC_PRF_TX_FLIT_CNT] =
+       DC_PERF_CNTR_LCB(DcPrfTxFlit, DC_LCB_PRF_TX_FLIT_CNT, CNTR_SYNTH),
+[C_DC_PRF_CLK_CNTR] =
+       DC_PERF_CNTR_LCB(DcPrfClk, DC_LCB_PRF_CLK_CNTR, CNTR_SYNTH),
+[C_DC_PG_DBG_FLIT_CRDTS_CNT] =
+       DC_PERF_CNTR_LCB(DcFltCrdts, DC_LCB_PG_DBG_FLIT_CRDTS_CNT, CNTR_SYNTH),
+[C_DC_PG_STS_PAUSE_COMPLETE_CNT] =
+       DC_PERF_CNTR_LCB(DcPauseComp, DC_LCB_PG_STS_PAUSE_COMPLETE_CNT,
+                        CNTR_SYNTH),
+[C_DC_PG_STS_TX_SBE_CNT] =
+       DC_PERF_CNTR_LCB(DcStsTxSbe, DC_LCB_PG_STS_TX_SBE_CNT, CNTR_SYNTH),
+[C_DC_PG_STS_TX_MBE_CNT] =
+       DC_PERF_CNTR_LCB(DcStsTxMbe, DC_LCB_PG_STS_TX_MBE_CNT,
+                        CNTR_SYNTH),
+[C_SW_CPU_INTR] = CNTR_ELEM("Intr", 0, 0, CNTR_NORMAL,
+                           access_sw_cpu_intr),
+[C_SW_CPU_RCV_LIM] = CNTR_ELEM("RcvLimit", 0, 0, CNTR_NORMAL,
+                           access_sw_cpu_rcv_limit),
+[C_SW_VTX_WAIT] = CNTR_ELEM("vTxWait", 0, 0, CNTR_NORMAL,
+                           access_sw_vtx_wait),
+[C_SW_PIO_WAIT] = CNTR_ELEM("PioWait", 0, 0, CNTR_NORMAL,
+                           access_sw_pio_wait),
+[C_SW_KMEM_WAIT] = CNTR_ELEM("KmemWait", 0, 0, CNTR_NORMAL,
+                           access_sw_kmem_wait),
+[C_SW_SEND_SCHED] = CNTR_ELEM("SendSched", 0, 0, CNTR_NORMAL,
+                           access_sw_send_schedule),
+};
+
+static struct cntr_entry port_cntrs[PORT_CNTR_LAST] = {
+[C_TX_UNSUP_VL] = TXE32_PORT_CNTR_ELEM(TxUnVLErr, SEND_UNSUP_VL_ERR_CNT,
+                       CNTR_NORMAL),
+[C_TX_INVAL_LEN] = TXE32_PORT_CNTR_ELEM(TxInvalLen, SEND_LEN_ERR_CNT,
+                       CNTR_NORMAL),
+[C_TX_MM_LEN_ERR] = TXE32_PORT_CNTR_ELEM(TxMMLenErr, SEND_MAX_MIN_LEN_ERR_CNT,
+                       CNTR_NORMAL),
+[C_TX_UNDERRUN] = TXE32_PORT_CNTR_ELEM(TxUnderrun, SEND_UNDERRUN_CNT,
+                       CNTR_NORMAL),
+[C_TX_FLOW_STALL] = TXE32_PORT_CNTR_ELEM(TxFlowStall, SEND_FLOW_STALL_CNT,
+                       CNTR_NORMAL),
+[C_TX_DROPPED] = TXE32_PORT_CNTR_ELEM(TxDropped, SEND_DROPPED_PKT_CNT,
+                       CNTR_NORMAL),
+[C_TX_HDR_ERR] = TXE32_PORT_CNTR_ELEM(TxHdrErr, SEND_HEADERS_ERR_CNT,
+                       CNTR_NORMAL),
+[C_TX_PKT] = TXE64_PORT_CNTR_ELEM(TxPkt, SEND_DATA_PKT_CNT, CNTR_NORMAL),
+[C_TX_WORDS] = TXE64_PORT_CNTR_ELEM(TxWords, SEND_DWORD_CNT, CNTR_NORMAL),
+[C_TX_WAIT] = TXE64_PORT_CNTR_ELEM(TxWait, SEND_WAIT_CNT, CNTR_SYNTH),
+[C_TX_FLIT_VL] = TXE64_PORT_CNTR_ELEM(TxFlitVL, SEND_DATA_VL0_CNT,
+                       CNTR_SYNTH | CNTR_VL),
+[C_TX_PKT_VL] = TXE64_PORT_CNTR_ELEM(TxPktVL, SEND_DATA_PKT_VL0_CNT,
+                       CNTR_SYNTH | CNTR_VL),
+[C_TX_WAIT_VL] = TXE64_PORT_CNTR_ELEM(TxWaitVL, SEND_WAIT_VL0_CNT,
+                       CNTR_SYNTH | CNTR_VL),
+[C_RX_PKT] = RXE64_PORT_CNTR_ELEM(RxPkt, RCV_DATA_PKT_CNT, CNTR_NORMAL),
+[C_RX_WORDS] = RXE64_PORT_CNTR_ELEM(RxWords, RCV_DWORD_CNT, CNTR_NORMAL),
+[C_SW_LINK_DOWN] = CNTR_ELEM("SwLinkDown", 0, 0, CNTR_SYNTH | CNTR_32BIT,
+                       access_sw_link_dn_cnt),
+[C_SW_LINK_UP] = CNTR_ELEM("SwLinkUp", 0, 0, CNTR_SYNTH | CNTR_32BIT,
+                       access_sw_link_up_cnt),
+[C_SW_XMIT_DSCD] = CNTR_ELEM("XmitDscd", 0, 0, CNTR_SYNTH | CNTR_32BIT,
+                       access_sw_xmit_discards),
+[C_SW_XMIT_DSCD_VL] = CNTR_ELEM("XmitDscdVl", 0, 0,
+                       CNTR_SYNTH | CNTR_32BIT | CNTR_VL,
+                       access_sw_xmit_discards),
+[C_SW_XMIT_CSTR_ERR] = CNTR_ELEM("XmitCstrErr", 0, 0, CNTR_SYNTH,
+                       access_xmit_constraint_errs),
+[C_SW_RCV_CSTR_ERR] = CNTR_ELEM("RcvCstrErr", 0, 0, CNTR_SYNTH,
+                       access_rcv_constraint_errs),
+[C_SW_IBP_LOOP_PKTS] = SW_IBP_CNTR(LoopPkts, loop_pkts),
+[C_SW_IBP_RC_RESENDS] = SW_IBP_CNTR(RcResend, rc_resends),
+[C_SW_IBP_RNR_NAKS] = SW_IBP_CNTR(RnrNak, rnr_naks),
+[C_SW_IBP_OTHER_NAKS] = SW_IBP_CNTR(OtherNak, other_naks),
+[C_SW_IBP_RC_TIMEOUTS] = SW_IBP_CNTR(RcTimeOut, rc_timeouts),
+[C_SW_IBP_PKT_DROPS] = SW_IBP_CNTR(PktDrop, pkt_drops),
+[C_SW_IBP_DMA_WAIT] = SW_IBP_CNTR(DmaWait, dmawait),
+[C_SW_IBP_RC_SEQNAK] = SW_IBP_CNTR(RcSeqNak, rc_seqnak),
+[C_SW_IBP_RC_DUPREQ] = SW_IBP_CNTR(RcDupRew, rc_dupreq),
+[C_SW_IBP_RDMA_SEQ] = SW_IBP_CNTR(RdmaSeq, rdma_seq),
+[C_SW_IBP_UNALIGNED] = SW_IBP_CNTR(Unaligned, unaligned),
+[C_SW_IBP_SEQ_NAK] = SW_IBP_CNTR(SeqNak, seq_naks),
+[C_SW_CPU_RC_ACKS] = CNTR_ELEM("RcAcks", 0, 0, CNTR_NORMAL,
+                              access_sw_cpu_rc_acks),
+[C_SW_CPU_RC_QACKS] = CNTR_ELEM("RcQacks", 0, 0, CNTR_NORMAL,
+                              access_sw_cpu_rc_qacks),
+[C_SW_CPU_RC_DELAYED_COMP] = CNTR_ELEM("RcDelayComp", 0, 0, CNTR_NORMAL,
+                              access_sw_cpu_rc_delayed_comp),
+[OVR_LBL(0)] = OVR_ELM(0), [OVR_LBL(1)] = OVR_ELM(1),
+[OVR_LBL(2)] = OVR_ELM(2), [OVR_LBL(3)] = OVR_ELM(3),
+[OVR_LBL(4)] = OVR_ELM(4), [OVR_LBL(5)] = OVR_ELM(5),
+[OVR_LBL(6)] = OVR_ELM(6), [OVR_LBL(7)] = OVR_ELM(7),
+[OVR_LBL(8)] = OVR_ELM(8), [OVR_LBL(9)] = OVR_ELM(9),
+[OVR_LBL(10)] = OVR_ELM(10), [OVR_LBL(11)] = OVR_ELM(11),
+[OVR_LBL(12)] = OVR_ELM(12), [OVR_LBL(13)] = OVR_ELM(13),
+[OVR_LBL(14)] = OVR_ELM(14), [OVR_LBL(15)] = OVR_ELM(15),
+[OVR_LBL(16)] = OVR_ELM(16), [OVR_LBL(17)] = OVR_ELM(17),
+[OVR_LBL(18)] = OVR_ELM(18), [OVR_LBL(19)] = OVR_ELM(19),
+[OVR_LBL(20)] = OVR_ELM(20), [OVR_LBL(21)] = OVR_ELM(21),
+[OVR_LBL(22)] = OVR_ELM(22), [OVR_LBL(23)] = OVR_ELM(23),
+[OVR_LBL(24)] = OVR_ELM(24), [OVR_LBL(25)] = OVR_ELM(25),
+[OVR_LBL(26)] = OVR_ELM(26), [OVR_LBL(27)] = OVR_ELM(27),
+[OVR_LBL(28)] = OVR_ELM(28), [OVR_LBL(29)] = OVR_ELM(29),
+[OVR_LBL(30)] = OVR_ELM(30), [OVR_LBL(31)] = OVR_ELM(31),
+[OVR_LBL(32)] = OVR_ELM(32), [OVR_LBL(33)] = OVR_ELM(33),
+[OVR_LBL(34)] = OVR_ELM(34), [OVR_LBL(35)] = OVR_ELM(35),
+[OVR_LBL(36)] = OVR_ELM(36), [OVR_LBL(37)] = OVR_ELM(37),
+[OVR_LBL(38)] = OVR_ELM(38), [OVR_LBL(39)] = OVR_ELM(39),
+[OVR_LBL(40)] = OVR_ELM(40), [OVR_LBL(41)] = OVR_ELM(41),
+[OVR_LBL(42)] = OVR_ELM(42), [OVR_LBL(43)] = OVR_ELM(43),
+[OVR_LBL(44)] = OVR_ELM(44), [OVR_LBL(45)] = OVR_ELM(45),
+[OVR_LBL(46)] = OVR_ELM(46), [OVR_LBL(47)] = OVR_ELM(47),
+[OVR_LBL(48)] = OVR_ELM(48), [OVR_LBL(49)] = OVR_ELM(49),
+[OVR_LBL(50)] = OVR_ELM(50), [OVR_LBL(51)] = OVR_ELM(51),
+[OVR_LBL(52)] = OVR_ELM(52), [OVR_LBL(53)] = OVR_ELM(53),
+[OVR_LBL(54)] = OVR_ELM(54), [OVR_LBL(55)] = OVR_ELM(55),
+[OVR_LBL(56)] = OVR_ELM(56), [OVR_LBL(57)] = OVR_ELM(57),
+[OVR_LBL(58)] = OVR_ELM(58), [OVR_LBL(59)] = OVR_ELM(59),
+[OVR_LBL(60)] = OVR_ELM(60), [OVR_LBL(61)] = OVR_ELM(61),
+[OVR_LBL(62)] = OVR_ELM(62), [OVR_LBL(63)] = OVR_ELM(63),
+[OVR_LBL(64)] = OVR_ELM(64), [OVR_LBL(65)] = OVR_ELM(65),
+[OVR_LBL(66)] = OVR_ELM(66), [OVR_LBL(67)] = OVR_ELM(67),
+[OVR_LBL(68)] = OVR_ELM(68), [OVR_LBL(69)] = OVR_ELM(69),
+[OVR_LBL(70)] = OVR_ELM(70), [OVR_LBL(71)] = OVR_ELM(71),
+[OVR_LBL(72)] = OVR_ELM(72), [OVR_LBL(73)] = OVR_ELM(73),
+[OVR_LBL(74)] = OVR_ELM(74), [OVR_LBL(75)] = OVR_ELM(75),
+[OVR_LBL(76)] = OVR_ELM(76), [OVR_LBL(77)] = OVR_ELM(77),
+[OVR_LBL(78)] = OVR_ELM(78), [OVR_LBL(79)] = OVR_ELM(79),
+[OVR_LBL(80)] = OVR_ELM(80), [OVR_LBL(81)] = OVR_ELM(81),
+[OVR_LBL(82)] = OVR_ELM(82), [OVR_LBL(83)] = OVR_ELM(83),
+[OVR_LBL(84)] = OVR_ELM(84), [OVR_LBL(85)] = OVR_ELM(85),
+[OVR_LBL(86)] = OVR_ELM(86), [OVR_LBL(87)] = OVR_ELM(87),
+[OVR_LBL(88)] = OVR_ELM(88), [OVR_LBL(89)] = OVR_ELM(89),
+[OVR_LBL(90)] = OVR_ELM(90), [OVR_LBL(91)] = OVR_ELM(91),
+[OVR_LBL(92)] = OVR_ELM(92), [OVR_LBL(93)] = OVR_ELM(93),
+[OVR_LBL(94)] = OVR_ELM(94), [OVR_LBL(95)] = OVR_ELM(95),
+[OVR_LBL(96)] = OVR_ELM(96), [OVR_LBL(97)] = OVR_ELM(97),
+[OVR_LBL(98)] = OVR_ELM(98), [OVR_LBL(99)] = OVR_ELM(99),
+[OVR_LBL(100)] = OVR_ELM(100), [OVR_LBL(101)] = OVR_ELM(101),
+[OVR_LBL(102)] = OVR_ELM(102), [OVR_LBL(103)] = OVR_ELM(103),
+[OVR_LBL(104)] = OVR_ELM(104), [OVR_LBL(105)] = OVR_ELM(105),
+[OVR_LBL(106)] = OVR_ELM(106), [OVR_LBL(107)] = OVR_ELM(107),
+[OVR_LBL(108)] = OVR_ELM(108), [OVR_LBL(109)] = OVR_ELM(109),
+[OVR_LBL(110)] = OVR_ELM(110), [OVR_LBL(111)] = OVR_ELM(111),
+[OVR_LBL(112)] = OVR_ELM(112), [OVR_LBL(113)] = OVR_ELM(113),
+[OVR_LBL(114)] = OVR_ELM(114), [OVR_LBL(115)] = OVR_ELM(115),
+[OVR_LBL(116)] = OVR_ELM(116), [OVR_LBL(117)] = OVR_ELM(117),
+[OVR_LBL(118)] = OVR_ELM(118), [OVR_LBL(119)] = OVR_ELM(119),
+[OVR_LBL(120)] = OVR_ELM(120), [OVR_LBL(121)] = OVR_ELM(121),
+[OVR_LBL(122)] = OVR_ELM(122), [OVR_LBL(123)] = OVR_ELM(123),
+[OVR_LBL(124)] = OVR_ELM(124), [OVR_LBL(125)] = OVR_ELM(125),
+[OVR_LBL(126)] = OVR_ELM(126), [OVR_LBL(127)] = OVR_ELM(127),
+[OVR_LBL(128)] = OVR_ELM(128), [OVR_LBL(129)] = OVR_ELM(129),
+[OVR_LBL(130)] = OVR_ELM(130), [OVR_LBL(131)] = OVR_ELM(131),
+[OVR_LBL(132)] = OVR_ELM(132), [OVR_LBL(133)] = OVR_ELM(133),
+[OVR_LBL(134)] = OVR_ELM(134), [OVR_LBL(135)] = OVR_ELM(135),
+[OVR_LBL(136)] = OVR_ELM(136), [OVR_LBL(137)] = OVR_ELM(137),
+[OVR_LBL(138)] = OVR_ELM(138), [OVR_LBL(139)] = OVR_ELM(139),
+[OVR_LBL(140)] = OVR_ELM(140), [OVR_LBL(141)] = OVR_ELM(141),
+[OVR_LBL(142)] = OVR_ELM(142), [OVR_LBL(143)] = OVR_ELM(143),
+[OVR_LBL(144)] = OVR_ELM(144), [OVR_LBL(145)] = OVR_ELM(145),
+[OVR_LBL(146)] = OVR_ELM(146), [OVR_LBL(147)] = OVR_ELM(147),
+[OVR_LBL(148)] = OVR_ELM(148), [OVR_LBL(149)] = OVR_ELM(149),
+[OVR_LBL(150)] = OVR_ELM(150), [OVR_LBL(151)] = OVR_ELM(151),
+[OVR_LBL(152)] = OVR_ELM(152), [OVR_LBL(153)] = OVR_ELM(153),
+[OVR_LBL(154)] = OVR_ELM(154), [OVR_LBL(155)] = OVR_ELM(155),
+[OVR_LBL(156)] = OVR_ELM(156), [OVR_LBL(157)] = OVR_ELM(157),
+[OVR_LBL(158)] = OVR_ELM(158), [OVR_LBL(159)] = OVR_ELM(159),
+};
+
+/* ======================================================================== */
+
+/* return true if this is chip revision revision a0 */
+int is_a0(struct hfi1_devdata *dd)
+{
+       return ((dd->revision >> CCE_REVISION_CHIP_REV_MINOR_SHIFT)
+                       & CCE_REVISION_CHIP_REV_MINOR_MASK) == 0;
+}
+
+/* return true if this is chip revision revision a */
+int is_ax(struct hfi1_devdata *dd)
+{
+       u8 chip_rev_minor =
+               dd->revision >> CCE_REVISION_CHIP_REV_MINOR_SHIFT
+                       & CCE_REVISION_CHIP_REV_MINOR_MASK;
+       return (chip_rev_minor & 0xf0) == 0;
+}
+
+/* return true if this is chip revision revision b */
+int is_bx(struct hfi1_devdata *dd)
+{
+       u8 chip_rev_minor =
+               dd->revision >> CCE_REVISION_CHIP_REV_MINOR_SHIFT
+                       & CCE_REVISION_CHIP_REV_MINOR_MASK;
+       return !!(chip_rev_minor & 0x10);
+}
+
+/*
+ * Append string s to buffer buf.  Arguments curp and len are the current
+ * position and remaining length, respectively.
+ *
+ * return 0 on success, 1 on out of room
+ */
+static int append_str(char *buf, char **curp, int *lenp, const char *s)
+{
+       char *p = *curp;
+       int len = *lenp;
+       int result = 0; /* success */
+       char c;
+
+       /* add a comma, if first in the buffer */
+       if (p != buf) {
+               if (len == 0) {
+                       result = 1; /* out of room */
+                       goto done;
+               }
+               *p++ = ',';
+               len--;
+       }
+
+       /* copy the string */
+       while ((c = *s++) != 0) {
+               if (len == 0) {
+                       result = 1; /* out of room */
+                       goto done;
+               }
+               *p++ = c;
+               len--;
+       }
+
+done:
+       /* write return values */
+       *curp = p;
+       *lenp = len;
+
+       return result;
+}
+
+/*
+ * Using the given flag table, print a comma separated string into
+ * the buffer.  End in '*' if the buffer is too short.
+ */
+static char *flag_string(char *buf, int buf_len, u64 flags,
+                               struct flag_table *table, int table_size)
+{
+       char extra[32];
+       char *p = buf;
+       int len = buf_len;
+       int no_room = 0;
+       int i;
+
+       /* make sure there is at least 2 so we can form "*" */
+       if (len < 2)
+               return "";
+
+       len--;  /* leave room for a nul */
+       for (i = 0; i < table_size; i++) {
+               if (flags & table[i].flag) {
+                       no_room = append_str(buf, &p, &len, table[i].str);
+                       if (no_room)
+                               break;
+                       flags &= ~table[i].flag;
+               }
+       }
+
+       /* any undocumented bits left? */
+       if (!no_room && flags) {
+               snprintf(extra, sizeof(extra), "bits 0x%llx", flags);
+               no_room = append_str(buf, &p, &len, extra);
+       }
+
+       /* add * if ran out of room */
+       if (no_room) {
+               /* may need to back up to add space for a '*' */
+               if (len == 0)
+                       --p;
+               *p++ = '*';
+       }
+
+       /* add final nul - space already allocated above */
+       *p = 0;
+       return buf;
+}
+
+/* first 8 CCE error interrupt source names */
+static const char * const cce_misc_names[] = {
+       "CceErrInt",            /* 0 */
+       "RxeErrInt",            /* 1 */
+       "MiscErrInt",           /* 2 */
+       "Reserved3",            /* 3 */
+       "PioErrInt",            /* 4 */
+       "SDmaErrInt",           /* 5 */
+       "EgressErrInt",         /* 6 */
+       "TxeErrInt"             /* 7 */
+};
+
+/*
+ * Return the miscellaneous error interrupt name.
+ */
+static char *is_misc_err_name(char *buf, size_t bsize, unsigned int source)
+{
+       if (source < ARRAY_SIZE(cce_misc_names))
+               strncpy(buf, cce_misc_names[source], bsize);
+       else
+               snprintf(buf,
+                       bsize,
+                       "Reserved%u",
+                       source + IS_GENERAL_ERR_START);
+
+       return buf;
+}
+
+/*
+ * Return the SDMA engine error interrupt name.
+ */
+static char *is_sdma_eng_err_name(char *buf, size_t bsize, unsigned int source)
+{
+       snprintf(buf, bsize, "SDmaEngErrInt%u", source);
+       return buf;
+}
+
+/*
+ * Return the send context error interrupt name.
+ */
+static char *is_sendctxt_err_name(char *buf, size_t bsize, unsigned int source)
+{
+       snprintf(buf, bsize, "SendCtxtErrInt%u", source);
+       return buf;
+}
+
+static const char * const various_names[] = {
+       "PbcInt",
+       "GpioAssertInt",
+       "Qsfp1Int",
+       "Qsfp2Int",
+       "TCritInt"
+};
+
+/*
+ * Return the various interrupt name.
+ */
+static char *is_various_name(char *buf, size_t bsize, unsigned int source)
+{
+       if (source < ARRAY_SIZE(various_names))
+               strncpy(buf, various_names[source], bsize);
+       else
+               snprintf(buf, bsize, "Reserved%u", source+IS_VARIOUS_START);
+       return buf;
+}
+
+/*
+ * Return the DC interrupt name.
+ */
+static char *is_dc_name(char *buf, size_t bsize, unsigned int source)
+{
+       static const char * const dc_int_names[] = {
+               "common",
+               "lcb",
+               "8051",
+               "lbm"   /* local block merge */
+       };
+
+       if (source < ARRAY_SIZE(dc_int_names))
+               snprintf(buf, bsize, "dc_%s_int", dc_int_names[source]);
+       else
+               snprintf(buf, bsize, "DCInt%u", source);
+       return buf;
+}
+
+static const char * const sdma_int_names[] = {
+       "SDmaInt",
+       "SdmaIdleInt",
+       "SdmaProgressInt",
+};
+
+/*
+ * Return the SDMA engine interrupt name.
+ */
+static char *is_sdma_eng_name(char *buf, size_t bsize, unsigned int source)
+{
+       /* what interrupt */
+       unsigned int what  = source / TXE_NUM_SDMA_ENGINES;
+       /* which engine */
+       unsigned int which = source % TXE_NUM_SDMA_ENGINES;
+
+       if (likely(what < 3))
+               snprintf(buf, bsize, "%s%u", sdma_int_names[what], which);
+       else
+               snprintf(buf, bsize, "Invalid SDMA interrupt %u", source);
+       return buf;
+}
+
+/*
+ * Return the receive available interrupt name.
+ */
+static char *is_rcv_avail_name(char *buf, size_t bsize, unsigned int source)
+{
+       snprintf(buf, bsize, "RcvAvailInt%u", source);
+       return buf;
+}
+
+/*
+ * Return the receive urgent interrupt name.
+ */
+static char *is_rcv_urgent_name(char *buf, size_t bsize, unsigned int source)
+{
+       snprintf(buf, bsize, "RcvUrgentInt%u", source);
+       return buf;
+}
+
+/*
+ * Return the send credit interrupt name.
+ */
+static char *is_send_credit_name(char *buf, size_t bsize, unsigned int source)
+{
+       snprintf(buf, bsize, "SendCreditInt%u", source);
+       return buf;
+}
+
+/*
+ * Return the reserved interrupt name.
+ */
+static char *is_reserved_name(char *buf, size_t bsize, unsigned int source)
+{
+       snprintf(buf, bsize, "Reserved%u", source + IS_RESERVED_START);
+       return buf;
+}
+
+static char *cce_err_status_string(char *buf, int buf_len, u64 flags)
+{
+       return flag_string(buf, buf_len, flags,
+                       cce_err_status_flags, ARRAY_SIZE(cce_err_status_flags));
+}
+
+static char *rxe_err_status_string(char *buf, int buf_len, u64 flags)
+{
+       return flag_string(buf, buf_len, flags,
+                       rxe_err_status_flags, ARRAY_SIZE(rxe_err_status_flags));
+}
+
+static char *misc_err_status_string(char *buf, int buf_len, u64 flags)
+{
+       return flag_string(buf, buf_len, flags, misc_err_status_flags,
+                       ARRAY_SIZE(misc_err_status_flags));
+}
+
+static char *pio_err_status_string(char *buf, int buf_len, u64 flags)
+{
+       return flag_string(buf, buf_len, flags,
+                       pio_err_status_flags, ARRAY_SIZE(pio_err_status_flags));
+}
+
+static char *sdma_err_status_string(char *buf, int buf_len, u64 flags)
+{
+       return flag_string(buf, buf_len, flags,
+                       sdma_err_status_flags,
+                       ARRAY_SIZE(sdma_err_status_flags));
+}
+
+static char *egress_err_status_string(char *buf, int buf_len, u64 flags)
+{
+       return flag_string(buf, buf_len, flags,
+               egress_err_status_flags, ARRAY_SIZE(egress_err_status_flags));
+}
+
+static char *egress_err_info_string(char *buf, int buf_len, u64 flags)
+{
+       return flag_string(buf, buf_len, flags,
+               egress_err_info_flags, ARRAY_SIZE(egress_err_info_flags));
+}
+
+static char *send_err_status_string(char *buf, int buf_len, u64 flags)
+{
+       return flag_string(buf, buf_len, flags,
+                       send_err_status_flags,
+                       ARRAY_SIZE(send_err_status_flags));
+}
+
+static void handle_cce_err(struct hfi1_devdata *dd, u32 unused, u64 reg)
+{
+       char buf[96];
+
+       /*
+        * For most these errors, there is nothing that can be done except
+        * report or record it.
+        */
+       dd_dev_info(dd, "CCE Error: %s\n",
+               cce_err_status_string(buf, sizeof(buf), reg));
+
+       if ((reg & CCE_ERR_STATUS_CCE_CLI2_ASYNC_FIFO_PARITY_ERR_SMASK)
+                       && is_a0(dd)
+                       && (dd->icode != ICODE_FUNCTIONAL_SIMULATOR)) {
+               /* this error requires a manual drop into SPC freeze mode */
+               /* then a fix up */
+               start_freeze_handling(dd->pport, FREEZE_SELF);
+       }
+}
+
+/*
+ * Check counters for receive errors that do not have an interrupt
+ * associated with them.
+ */
+#define RCVERR_CHECK_TIME 10
+static void update_rcverr_timer(unsigned long opaque)
+{
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)opaque;
+       struct hfi1_pportdata *ppd = dd->pport;
+       u32 cur_ovfl_cnt = read_dev_cntr(dd, C_RCV_OVF, CNTR_INVALID_VL);
+
+       if (dd->rcv_ovfl_cnt < cur_ovfl_cnt &&
+               ppd->port_error_action & OPA_PI_MASK_EX_BUFFER_OVERRUN) {
+               dd_dev_info(dd, "%s: PortErrorAction bounce\n", __func__);
+               set_link_down_reason(ppd,
+                 OPA_LINKDOWN_REASON_EXCESSIVE_BUFFER_OVERRUN, 0,
+                       OPA_LINKDOWN_REASON_EXCESSIVE_BUFFER_OVERRUN);
+               queue_work(ppd->hfi1_wq, &ppd->link_bounce_work);
+       }
+       dd->rcv_ovfl_cnt = (u32) cur_ovfl_cnt;
+
+       mod_timer(&dd->rcverr_timer, jiffies + HZ * RCVERR_CHECK_TIME);
+}
+
+static int init_rcverr(struct hfi1_devdata *dd)
+{
+       setup_timer(&dd->rcverr_timer, update_rcverr_timer, (unsigned long)dd);
+       /* Assume the hardware counter has been reset */
+       dd->rcv_ovfl_cnt = 0;
+       return mod_timer(&dd->rcverr_timer, jiffies + HZ * RCVERR_CHECK_TIME);
+}
+
+static void free_rcverr(struct hfi1_devdata *dd)
+{
+       if (dd->rcverr_timer.data)
+               del_timer_sync(&dd->rcverr_timer);
+       dd->rcverr_timer.data = 0;
+}
+
+static void handle_rxe_err(struct hfi1_devdata *dd, u32 unused, u64 reg)
+{
+       char buf[96];
+
+       dd_dev_info(dd, "Receive Error: %s\n",
+               rxe_err_status_string(buf, sizeof(buf), reg));
+
+       if (reg & ALL_RXE_FREEZE_ERR) {
+               int flags = 0;
+
+               /*
+                * Freeze mode recovery is disabled for the errors
+                * in RXE_FREEZE_ABORT_MASK
+                */
+               if (is_a0(dd) && (reg & RXE_FREEZE_ABORT_MASK))
+                       flags = FREEZE_ABORT;
+
+               start_freeze_handling(dd->pport, flags);
+       }
+}
+
+static void handle_misc_err(struct hfi1_devdata *dd, u32 unused, u64 reg)
+{
+       char buf[96];
+
+       dd_dev_info(dd, "Misc Error: %s",
+               misc_err_status_string(buf, sizeof(buf), reg));
+}
+
+static void handle_pio_err(struct hfi1_devdata *dd, u32 unused, u64 reg)
+{
+       char buf[96];
+
+       dd_dev_info(dd, "PIO Error: %s\n",
+               pio_err_status_string(buf, sizeof(buf), reg));
+
+       if (reg & ALL_PIO_FREEZE_ERR)
+               start_freeze_handling(dd->pport, 0);
+}
+
+static void handle_sdma_err(struct hfi1_devdata *dd, u32 unused, u64 reg)
+{
+       char buf[96];
+
+       dd_dev_info(dd, "SDMA Error: %s\n",
+               sdma_err_status_string(buf, sizeof(buf), reg));
+
+       if (reg & ALL_SDMA_FREEZE_ERR)
+               start_freeze_handling(dd->pport, 0);
+}
+
+static void count_port_inactive(struct hfi1_devdata *dd)
+{
+       struct hfi1_pportdata *ppd = dd->pport;
+
+       if (ppd->port_xmit_discards < ~(u64)0)
+               ppd->port_xmit_discards++;
+}
+
+/*
+ * We have had a "disallowed packet" error during egress. Determine the
+ * integrity check which failed, and update relevant error counter, etc.
+ *
+ * Note that the SEND_EGRESS_ERR_INFO register has only a single
+ * bit of state per integrity check, and so we can miss the reason for an
+ * egress error if more than one packet fails the same integrity check
+ * since we cleared the corresponding bit in SEND_EGRESS_ERR_INFO.
+ */
+static void handle_send_egress_err_info(struct hfi1_devdata *dd)
+{
+       struct hfi1_pportdata *ppd = dd->pport;
+       u64 src = read_csr(dd, SEND_EGRESS_ERR_SOURCE); /* read first */
+       u64 info = read_csr(dd, SEND_EGRESS_ERR_INFO);
+       char buf[96];
+
+       /* clear down all observed info as quickly as possible after read */
+       write_csr(dd, SEND_EGRESS_ERR_INFO, info);
+
+       dd_dev_info(dd,
+               "Egress Error Info: 0x%llx, %s Egress Error Src 0x%llx\n",
+               info, egress_err_info_string(buf, sizeof(buf), info), src);
+
+       /* Eventually add other counters for each bit */
+
+       if (info & SEND_EGRESS_ERR_INFO_TOO_LONG_IB_PACKET_ERR_SMASK) {
+               if (ppd->port_xmit_discards < ~(u64)0)
+                       ppd->port_xmit_discards++;
+       }
+}
+
+/*
+ * Input value is a bit position within the SEND_EGRESS_ERR_STATUS
+ * register. Does it represent a 'port inactive' error?
+ */
+static inline int port_inactive_err(u64 posn)
+{
+       return (posn >= SEES(TX_LINKDOWN) &&
+               posn <= SEES(TX_INCORRECT_LINK_STATE));
+}
+
+/*
+ * Input value is a bit position within the SEND_EGRESS_ERR_STATUS
+ * register. Does it represent a 'disallowed packet' error?
+ */
+static inline int disallowed_pkt_err(u64 posn)
+{
+       return (posn >= SEES(TX_SDMA0_DISALLOWED_PACKET) &&
+               posn <= SEES(TX_SDMA15_DISALLOWED_PACKET));
+}
+
+static void handle_egress_err(struct hfi1_devdata *dd, u32 unused, u64 reg)
+{
+       u64 reg_copy = reg, handled = 0;
+       char buf[96];
+
+       if (reg & ALL_TXE_EGRESS_FREEZE_ERR)
+               start_freeze_handling(dd->pport, 0);
+       if (is_a0(dd) && (reg &
+                   SEND_EGRESS_ERR_STATUS_TX_CREDIT_RETURN_VL_ERR_SMASK)
+                   && (dd->icode != ICODE_FUNCTIONAL_SIMULATOR))
+               start_freeze_handling(dd->pport, 0);
+
+       while (reg_copy) {
+               int posn = fls64(reg_copy);
+               /*
+                * fls64() returns a 1-based offset, but we generally
+                * want 0-based offsets.
+                */
+               int shift = posn - 1;
+
+               if (port_inactive_err(shift)) {
+                       count_port_inactive(dd);
+                       handled |= (1ULL << shift);
+               } else if (disallowed_pkt_err(shift)) {
+                       handle_send_egress_err_info(dd);
+                       handled |= (1ULL << shift);
+               }
+               clear_bit(shift, (unsigned long *)&reg_copy);
+       }
+
+       reg &= ~handled;
+
+       if (reg)
+               dd_dev_info(dd, "Egress Error: %s\n",
+                       egress_err_status_string(buf, sizeof(buf), reg));
+}
+
+static void handle_txe_err(struct hfi1_devdata *dd, u32 unused, u64 reg)
+{
+       char buf[96];
+
+       dd_dev_info(dd, "Send Error: %s\n",
+               send_err_status_string(buf, sizeof(buf), reg));
+
+}
+
+/*
+ * The maximum number of times the error clear down will loop before
+ * blocking a repeating error.  This value is arbitrary.
+ */
+#define MAX_CLEAR_COUNT 20
+
+/*
+ * Clear and handle an error register.  All error interrupts are funneled
+ * through here to have a central location to correctly handle single-
+ * or multi-shot errors.
+ *
+ * For non per-context registers, call this routine with a context value
+ * of 0 so the per-context offset is zero.
+ *
+ * If the handler loops too many times, assume that something is wrong
+ * and can't be fixed, so mask the error bits.
+ */
+static void interrupt_clear_down(struct hfi1_devdata *dd,
+                                u32 context,
+                                const struct err_reg_info *eri)
+{
+       u64 reg;
+       u32 count;
+
+       /* read in a loop until no more errors are seen */
+       count = 0;
+       while (1) {
+               reg = read_kctxt_csr(dd, context, eri->status);
+               if (reg == 0)
+                       break;
+               write_kctxt_csr(dd, context, eri->clear, reg);
+               if (likely(eri->handler))
+                       eri->handler(dd, context, reg);
+               count++;
+               if (count > MAX_CLEAR_COUNT) {
+                       u64 mask;
+
+                       dd_dev_err(dd, "Repeating %s bits 0x%llx - masking\n",
+                               eri->desc, reg);
+                       /*
+                        * Read-modify-write so any other masked bits
+                        * remain masked.
+                        */
+                       mask = read_kctxt_csr(dd, context, eri->mask);
+                       mask &= ~reg;
+                       write_kctxt_csr(dd, context, eri->mask, mask);
+                       break;
+               }
+       }
+}
+
+/*
+ * CCE block "misc" interrupt.  Source is < 16.
+ */
+static void is_misc_err_int(struct hfi1_devdata *dd, unsigned int source)
+{
+       const struct err_reg_info *eri = &misc_errs[source];
+
+       if (eri->handler) {
+               interrupt_clear_down(dd, 0, eri);
+       } else {
+               dd_dev_err(dd, "Unexpected misc interrupt (%u) - reserved\n",
+                       source);
+       }
+}
+
+static char *send_context_err_status_string(char *buf, int buf_len, u64 flags)
+{
+       return flag_string(buf, buf_len, flags,
+                       sc_err_status_flags, ARRAY_SIZE(sc_err_status_flags));
+}
+
+/*
+ * Send context error interrupt.  Source (hw_context) is < 160.
+ *
+ * All send context errors cause the send context to halt.  The normal
+ * clear-down mechanism cannot be used because we cannot clear the
+ * error bits until several other long-running items are done first.
+ * This is OK because with the context halted, nothing else is going
+ * to happen on it anyway.
+ */
+static void is_sendctxt_err_int(struct hfi1_devdata *dd,
+                               unsigned int hw_context)
+{
+       struct send_context_info *sci;
+       struct send_context *sc;
+       char flags[96];
+       u64 status;
+       u32 sw_index;
+
+       sw_index = dd->hw_to_sw[hw_context];
+       if (sw_index >= dd->num_send_contexts) {
+               dd_dev_err(dd,
+                       "out of range sw index %u for send context %u\n",
+                       sw_index, hw_context);
+               return;
+       }
+       sci = &dd->send_contexts[sw_index];
+       sc = sci->sc;
+       if (!sc) {
+               dd_dev_err(dd, "%s: context %u(%u): no sc?\n", __func__,
+                       sw_index, hw_context);
+               return;
+       }
+
+       /* tell the software that a halt has begun */
+       sc_stop(sc, SCF_HALTED);
+
+       status = read_kctxt_csr(dd, hw_context, SEND_CTXT_ERR_STATUS);
+
+       dd_dev_info(dd, "Send Context %u(%u) Error: %s\n", sw_index, hw_context,
+               send_context_err_status_string(flags, sizeof(flags), status));
+
+       if (status & SEND_CTXT_ERR_STATUS_PIO_DISALLOWED_PACKET_ERR_SMASK)
+               handle_send_egress_err_info(dd);
+
+       /*
+        * Automatically restart halted kernel contexts out of interrupt
+        * context.  User contexts must ask the driver to restart the context.
+        */
+       if (sc->type != SC_USER)
+               queue_work(dd->pport->hfi1_wq, &sc->halt_work);
+}
+
+static void handle_sdma_eng_err(struct hfi1_devdata *dd,
+                               unsigned int source, u64 status)
+{
+       struct sdma_engine *sde;
+
+       sde = &dd->per_sdma[source];
+#ifdef CONFIG_SDMA_VERBOSITY
+       dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n", sde->this_idx,
+                  slashstrip(__FILE__), __LINE__, __func__);
+       dd_dev_err(sde->dd, "CONFIG SDMA(%u) source: %u status 0x%llx\n",
+                  sde->this_idx, source, (unsigned long long)status);
+#endif
+       sdma_engine_error(sde, status);
+}
+
+/*
+ * CCE block SDMA error interrupt.  Source is < 16.
+ */
+static void is_sdma_eng_err_int(struct hfi1_devdata *dd, unsigned int source)
+{
+#ifdef CONFIG_SDMA_VERBOSITY
+       struct sdma_engine *sde = &dd->per_sdma[source];
+
+       dd_dev_err(dd, "CONFIG SDMA(%u) %s:%d %s()\n", sde->this_idx,
+                  slashstrip(__FILE__), __LINE__, __func__);
+       dd_dev_err(dd, "CONFIG SDMA(%u) source: %u\n", sde->this_idx,
+                  source);
+       sdma_dumpstate(sde);
+#endif
+       interrupt_clear_down(dd, source, &sdma_eng_err);
+}
+
+/*
+ * CCE block "various" interrupt.  Source is < 8.
+ */
+static void is_various_int(struct hfi1_devdata *dd, unsigned int source)
+{
+       const struct err_reg_info *eri = &various_err[source];
+
+       /*
+        * TCritInt cannot go through interrupt_clear_down()
+        * because it is not a second tier interrupt. The handler
+        * should be called directly.
+        */
+       if (source == TCRIT_INT_SOURCE)
+               handle_temp_err(dd);
+       else if (eri->handler)
+               interrupt_clear_down(dd, 0, eri);
+       else
+               dd_dev_info(dd,
+                       "%s: Unimplemented/reserved interrupt %d\n",
+                       __func__, source);
+}
+
+static void handle_qsfp_int(struct hfi1_devdata *dd, u32 src_ctx, u64 reg)
+{
+       /* source is always zero */
+       struct hfi1_pportdata *ppd = dd->pport;
+       unsigned long flags;
+       u64 qsfp_int_mgmt = (u64)(QSFP_HFI0_INT_N | QSFP_HFI0_MODPRST_N);
+
+       if (reg & QSFP_HFI0_MODPRST_N) {
+
+               dd_dev_info(dd, "%s: ModPresent triggered QSFP interrupt\n",
+                               __func__);
+
+               if (!qsfp_mod_present(ppd)) {
+                       ppd->driver_link_ready = 0;
+                       /*
+                        * Cable removed, reset all our information about the
+                        * cache and cable capabilities
+                        */
+
+                       spin_lock_irqsave(&ppd->qsfp_info.qsfp_lock, flags);
+                       /*
+                        * We don't set cache_refresh_required here as we expect
+                        * an interrupt when a cable is inserted
+                        */
+                       ppd->qsfp_info.cache_valid = 0;
+                       ppd->qsfp_info.qsfp_interrupt_functional = 0;
+                       spin_unlock_irqrestore(&ppd->qsfp_info.qsfp_lock,
+                                               flags);
+                       write_csr(dd,
+                                       dd->hfi1_id ?
+                                               ASIC_QSFP2_INVERT :
+                                               ASIC_QSFP1_INVERT,
+                               qsfp_int_mgmt);
+                       if (ppd->host_link_state == HLS_DN_POLL) {
+                               /*
+                                * The link is still in POLL. This means
+                                * that the normal link down processing
+                                * will not happen. We have to do it here
+                                * before turning the DC off.
+                                */
+                               queue_work(ppd->hfi1_wq, &ppd->link_down_work);
+                       }
+               } else {
+                       spin_lock_irqsave(&ppd->qsfp_info.qsfp_lock, flags);
+                       ppd->qsfp_info.cache_valid = 0;
+                       ppd->qsfp_info.cache_refresh_required = 1;
+                       spin_unlock_irqrestore(&ppd->qsfp_info.qsfp_lock,
+                                               flags);
+
+                       qsfp_int_mgmt &= ~(u64)QSFP_HFI0_MODPRST_N;
+                       write_csr(dd,
+                                       dd->hfi1_id ?
+                                               ASIC_QSFP2_INVERT :
+                                               ASIC_QSFP1_INVERT,
+                               qsfp_int_mgmt);
+               }
+       }
+
+       if (reg & QSFP_HFI0_INT_N) {
+
+               dd_dev_info(dd, "%s: IntN triggered QSFP interrupt\n",
+                               __func__);
+               spin_lock_irqsave(&ppd->qsfp_info.qsfp_lock, flags);
+               ppd->qsfp_info.check_interrupt_flags = 1;
+               ppd->qsfp_info.qsfp_interrupt_functional = 1;
+               spin_unlock_irqrestore(&ppd->qsfp_info.qsfp_lock, flags);
+       }
+
+       /* Schedule the QSFP work only if there is a cable attached. */
+       if (qsfp_mod_present(ppd))
+               queue_work(ppd->hfi1_wq, &ppd->qsfp_info.qsfp_work);
+}
+
+static int request_host_lcb_access(struct hfi1_devdata *dd)
+{
+       int ret;
+
+       ret = do_8051_command(dd, HCMD_MISC,
+               (u64)HCMD_MISC_REQUEST_LCB_ACCESS << LOAD_DATA_FIELD_ID_SHIFT,
+               NULL);
+       if (ret != HCMD_SUCCESS) {
+               dd_dev_err(dd, "%s: command failed with error %d\n",
+                       __func__, ret);
+       }
+       return ret == HCMD_SUCCESS ? 0 : -EBUSY;
+}
+
+static int request_8051_lcb_access(struct hfi1_devdata *dd)
+{
+       int ret;
+
+       ret = do_8051_command(dd, HCMD_MISC,
+               (u64)HCMD_MISC_GRANT_LCB_ACCESS << LOAD_DATA_FIELD_ID_SHIFT,
+               NULL);
+       if (ret != HCMD_SUCCESS) {
+               dd_dev_err(dd, "%s: command failed with error %d\n",
+                       __func__, ret);
+       }
+       return ret == HCMD_SUCCESS ? 0 : -EBUSY;
+}
+
+/*
+ * Set the LCB selector - allow host access.  The DCC selector always
+ * points to the host.
+ */
+static inline void set_host_lcb_access(struct hfi1_devdata *dd)
+{
+       write_csr(dd, DC_DC8051_CFG_CSR_ACCESS_SEL,
+                               DC_DC8051_CFG_CSR_ACCESS_SEL_DCC_SMASK
+                               | DC_DC8051_CFG_CSR_ACCESS_SEL_LCB_SMASK);
+}
+
+/*
+ * Clear the LCB selector - allow 8051 access.  The DCC selector always
+ * points to the host.
+ */
+static inline void set_8051_lcb_access(struct hfi1_devdata *dd)
+{
+       write_csr(dd, DC_DC8051_CFG_CSR_ACCESS_SEL,
+                               DC_DC8051_CFG_CSR_ACCESS_SEL_DCC_SMASK);
+}
+
+/*
+ * Acquire LCB access from the 8051.  If the host already has access,
+ * just increment a counter.  Otherwise, inform the 8051 that the
+ * host is taking access.
+ *
+ * Returns:
+ *     0 on success
+ *     -EBUSY if the 8051 has control and cannot be disturbed
+ *     -errno if unable to acquire access from the 8051
+ */
+int acquire_lcb_access(struct hfi1_devdata *dd, int sleep_ok)
+{
+       struct hfi1_pportdata *ppd = dd->pport;
+       int ret = 0;
+
+       /*
+        * Use the host link state lock so the operation of this routine
+        * { link state check, selector change, count increment } can occur
+        * as a unit against a link state change.  Otherwise there is a
+        * race between the state change and the count increment.
+        */
+       if (sleep_ok) {
+               mutex_lock(&ppd->hls_lock);
+       } else {
+               while (!mutex_trylock(&ppd->hls_lock))
+                       udelay(1);
+       }
+
+       /* this access is valid only when the link is up */
+       if ((ppd->host_link_state & HLS_UP) == 0) {
+               dd_dev_info(dd, "%s: link state %s not up\n",
+                       __func__, link_state_name(ppd->host_link_state));
+               ret = -EBUSY;
+               goto done;
+       }
+
+       if (dd->lcb_access_count == 0) {
+               ret = request_host_lcb_access(dd);
+               if (ret) {
+                       dd_dev_err(dd,
+                               "%s: unable to acquire LCB access, err %d\n",
+                               __func__, ret);
+                       goto done;
+               }
+               set_host_lcb_access(dd);
+       }
+       dd->lcb_access_count++;
+done:
+       mutex_unlock(&ppd->hls_lock);
+       return ret;
+}
+
+/*
+ * Release LCB access by decrementing the use count.  If the count is moving
+ * from 1 to 0, inform 8051 that it has control back.
+ *
+ * Returns:
+ *     0 on success
+ *     -errno if unable to release access to the 8051
+ */
+int release_lcb_access(struct hfi1_devdata *dd, int sleep_ok)
+{
+       int ret = 0;
+
+       /*
+        * Use the host link state lock because the acquire needed it.
+        * Here, we only need to keep { selector change, count decrement }
+        * as a unit.
+        */
+       if (sleep_ok) {
+               mutex_lock(&dd->pport->hls_lock);
+       } else {
+               while (!mutex_trylock(&dd->pport->hls_lock))
+                       udelay(1);
+       }
+
+       if (dd->lcb_access_count == 0) {
+               dd_dev_err(dd, "%s: LCB access count is zero.  Skipping.\n",
+                       __func__);
+               goto done;
+       }
+
+       if (dd->lcb_access_count == 1) {
+               set_8051_lcb_access(dd);
+               ret = request_8051_lcb_access(dd);
+               if (ret) {
+                       dd_dev_err(dd,
+                               "%s: unable to release LCB access, err %d\n",
+                               __func__, ret);
+                       /* restore host access if the grant didn't work */
+                       set_host_lcb_access(dd);
+                       goto done;
+               }
+       }
+       dd->lcb_access_count--;
+done:
+       mutex_unlock(&dd->pport->hls_lock);
+       return ret;
+}
+
+/*
+ * Initialize LCB access variables and state.  Called during driver load,
+ * after most of the initialization is finished.
+ *
+ * The DC default is LCB access on for the host.  The driver defaults to
+ * leaving access to the 8051.  Assign access now - this constrains the call
+ * to this routine to be after all LCB set-up is done.  In particular, after
+ * hf1_init_dd() -> set_up_interrupts() -> clear_all_interrupts()
+ */
+static void init_lcb_access(struct hfi1_devdata *dd)
+{
+       dd->lcb_access_count = 0;
+}
+
+/*
+ * Write a response back to a 8051 request.
+ */
+static void hreq_response(struct hfi1_devdata *dd, u8 return_code, u16 rsp_data)
+{
+       write_csr(dd, DC_DC8051_CFG_EXT_DEV_0,
+               DC_DC8051_CFG_EXT_DEV_0_COMPLETED_SMASK
+               | (u64)return_code << DC_DC8051_CFG_EXT_DEV_0_RETURN_CODE_SHIFT
+               | (u64)rsp_data << DC_DC8051_CFG_EXT_DEV_0_RSP_DATA_SHIFT);
+}
+
+/*
+ * Handle requests from the 8051.
+ */
+static void handle_8051_request(struct hfi1_devdata *dd)
+{
+       u64 reg;
+       u16 data;
+       u8 type;
+
+       reg = read_csr(dd, DC_DC8051_CFG_EXT_DEV_1);
+       if ((reg & DC_DC8051_CFG_EXT_DEV_1_REQ_NEW_SMASK) == 0)
+               return; /* no request */
+
+       /* zero out COMPLETED so the response is seen */
+       write_csr(dd, DC_DC8051_CFG_EXT_DEV_0, 0);
+
+       /* extract request details */
+       type = (reg >> DC_DC8051_CFG_EXT_DEV_1_REQ_TYPE_SHIFT)
+                       & DC_DC8051_CFG_EXT_DEV_1_REQ_TYPE_MASK;
+       data = (reg >> DC_DC8051_CFG_EXT_DEV_1_REQ_DATA_SHIFT)
+                       & DC_DC8051_CFG_EXT_DEV_1_REQ_DATA_MASK;
+
+       switch (type) {
+       case HREQ_LOAD_CONFIG:
+       case HREQ_SAVE_CONFIG:
+       case HREQ_READ_CONFIG:
+       case HREQ_SET_TX_EQ_ABS:
+       case HREQ_SET_TX_EQ_REL:
+       case HREQ_ENABLE:
+               dd_dev_info(dd, "8051 request: request 0x%x not supported\n",
+                       type);
+               hreq_response(dd, HREQ_NOT_SUPPORTED, 0);
+               break;
+
+       case HREQ_CONFIG_DONE:
+               hreq_response(dd, HREQ_SUCCESS, 0);
+               break;
+
+       case HREQ_INTERFACE_TEST:
+               hreq_response(dd, HREQ_SUCCESS, data);
+               break;
+
+       default:
+               dd_dev_err(dd, "8051 request: unknown request 0x%x\n", type);
+               hreq_response(dd, HREQ_NOT_SUPPORTED, 0);
+               break;
+       }
+}
+
+static void write_global_credit(struct hfi1_devdata *dd,
+                               u8 vau, u16 total, u16 shared)
+{
+       write_csr(dd, SEND_CM_GLOBAL_CREDIT,
+               ((u64)total
+                       << SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_SHIFT)
+               | ((u64)shared
+                       << SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_SHIFT)
+               | ((u64)vau << SEND_CM_GLOBAL_CREDIT_AU_SHIFT));
+}
+
+/*
+ * Set up initial VL15 credits of the remote.  Assumes the rest of
+ * the CM credit registers are zero from a previous global or credit reset .
+ */
+void set_up_vl15(struct hfi1_devdata *dd, u8 vau, u16 vl15buf)
+{
+       /* leave shared count at zero for both global and VL15 */
+       write_global_credit(dd, vau, vl15buf, 0);
+
+       /* We may need some credits for another VL when sending packets
+        * with the snoop interface. Dividing it down the middle for VL15
+        * and VL0 should suffice.
+        */
+       if (unlikely(dd->hfi1_snoop.mode_flag == HFI1_PORT_SNOOP_MODE)) {
+               write_csr(dd, SEND_CM_CREDIT_VL15, (u64)(vl15buf >> 1)
+                   << SEND_CM_CREDIT_VL15_DEDICATED_LIMIT_VL_SHIFT);
+               write_csr(dd, SEND_CM_CREDIT_VL, (u64)(vl15buf >> 1)
+                   << SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_SHIFT);
+       } else {
+               write_csr(dd, SEND_CM_CREDIT_VL15, (u64)vl15buf
+                       << SEND_CM_CREDIT_VL15_DEDICATED_LIMIT_VL_SHIFT);
+       }
+}
+
+/*
+ * Zero all credit details from the previous connection and
+ * reset the CM manager's internal counters.
+ */
+void reset_link_credits(struct hfi1_devdata *dd)
+{
+       int i;
+
+       /* remove all previous VL credit limits */
+       for (i = 0; i < TXE_NUM_DATA_VL; i++)
+               write_csr(dd, SEND_CM_CREDIT_VL + (8*i), 0);
+       write_csr(dd, SEND_CM_CREDIT_VL15, 0);
+       write_global_credit(dd, 0, 0, 0);
+       /* reset the CM block */
+       pio_send_control(dd, PSC_CM_RESET);
+}
+
+/* convert a vCU to a CU */
+static u32 vcu_to_cu(u8 vcu)
+{
+       return 1 << vcu;
+}
+
+/* convert a CU to a vCU */
+static u8 cu_to_vcu(u32 cu)
+{
+       return ilog2(cu);
+}
+
+/* convert a vAU to an AU */
+static u32 vau_to_au(u8 vau)
+{
+       return 8 * (1 << vau);
+}
+
+static void set_linkup_defaults(struct hfi1_pportdata *ppd)
+{
+       ppd->sm_trap_qp = 0x0;
+       ppd->sa_qp = 0x1;
+}
+
+/*
+ * Graceful LCB shutdown.  This leaves the LCB FIFOs in reset.
+ */
+static void lcb_shutdown(struct hfi1_devdata *dd, int abort)
+{
+       u64 reg;
+
+       /* clear lcb run: LCB_CFG_RUN.EN = 0 */
+       write_csr(dd, DC_LCB_CFG_RUN, 0);
+       /* set tx fifo reset: LCB_CFG_TX_FIFOS_RESET.VAL = 1 */
+       write_csr(dd, DC_LCB_CFG_TX_FIFOS_RESET,
+               1ull << DC_LCB_CFG_TX_FIFOS_RESET_VAL_SHIFT);
+       /* set dcc reset csr: DCC_CFG_RESET.{reset_lcb,reset_rx_fpe} = 1 */
+       dd->lcb_err_en = read_csr(dd, DC_LCB_ERR_EN);
+       reg = read_csr(dd, DCC_CFG_RESET);
+       write_csr(dd, DCC_CFG_RESET,
+               reg
+               | (1ull << DCC_CFG_RESET_RESET_LCB_SHIFT)
+               | (1ull << DCC_CFG_RESET_RESET_RX_FPE_SHIFT));
+       (void) read_csr(dd, DCC_CFG_RESET); /* make sure the write completed */
+       if (!abort) {
+               udelay(1);    /* must hold for the longer of 16cclks or 20ns */
+               write_csr(dd, DCC_CFG_RESET, reg);
+               write_csr(dd, DC_LCB_ERR_EN, dd->lcb_err_en);
+       }
+}
+
+/*
+ * This routine should be called after the link has been transitioned to
+ * OFFLINE (OFFLINE state has the side effect of putting the SerDes into
+ * reset).
+ *
+ * The expectation is that the caller of this routine would have taken
+ * care of properly transitioning the link into the correct state.
+ */
+static void dc_shutdown(struct hfi1_devdata *dd)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&dd->dc8051_lock, flags);
+       if (dd->dc_shutdown) {
+               spin_unlock_irqrestore(&dd->dc8051_lock, flags);
+               return;
+       }
+       dd->dc_shutdown = 1;
+       spin_unlock_irqrestore(&dd->dc8051_lock, flags);
+       /* Shutdown the LCB */
+       lcb_shutdown(dd, 1);
+       /* Going to OFFLINE would have causes the 8051 to put the
+        * SerDes into reset already. Just need to shut down the 8051,
+        * itself. */
+       write_csr(dd, DC_DC8051_CFG_RST, 0x1);
+}
+
+/* Calling this after the DC has been brought out of reset should not
+ * do any damage. */
+static void dc_start(struct hfi1_devdata *dd)
+{
+       unsigned long flags;
+       int ret;
+
+       spin_lock_irqsave(&dd->dc8051_lock, flags);
+       if (!dd->dc_shutdown)
+               goto done;
+       spin_unlock_irqrestore(&dd->dc8051_lock, flags);
+       /* Take the 8051 out of reset */
+       write_csr(dd, DC_DC8051_CFG_RST, 0ull);
+       /* Wait until 8051 is ready */
+       ret = wait_fm_ready(dd, TIMEOUT_8051_START);
+       if (ret) {
+               dd_dev_err(dd, "%s: timeout starting 8051 firmware\n",
+                       __func__);
+       }
+       /* Take away reset for LCB and RX FPE (set in lcb_shutdown). */
+       write_csr(dd, DCC_CFG_RESET, 0x10);
+       /* lcb_shutdown() with abort=1 does not restore these */
+       write_csr(dd, DC_LCB_ERR_EN, dd->lcb_err_en);
+       spin_lock_irqsave(&dd->dc8051_lock, flags);
+       dd->dc_shutdown = 0;
+done:
+       spin_unlock_irqrestore(&dd->dc8051_lock, flags);
+}
+
+/*
+ * These LCB adjustments are for the Aurora SerDes core in the FPGA.
+ */
+static void adjust_lcb_for_fpga_serdes(struct hfi1_devdata *dd)
+{
+       u64 rx_radr, tx_radr;
+       u32 version;
+
+       if (dd->icode != ICODE_FPGA_EMULATION)
+               return;
+
+       /*
+        * These LCB defaults on emulator _s are good, nothing to do here:
+        *      LCB_CFG_TX_FIFOS_RADR
+        *      LCB_CFG_RX_FIFOS_RADR
+        *      LCB_CFG_LN_DCLK
+        *      LCB_CFG_IGNORE_LOST_RCLK
+        */
+       if (is_emulator_s(dd))
+               return;
+       /* else this is _p */
+
+       version = emulator_rev(dd);
+       if (!is_a0(dd))
+               version = 0x2d; /* all B0 use 0x2d or higher settings */
+
+       if (version <= 0x12) {
+               /* release 0x12 and below */
+
+               /*
+                * LCB_CFG_RX_FIFOS_RADR.RST_VAL = 0x9
+                * LCB_CFG_RX_FIFOS_RADR.OK_TO_JUMP_VAL = 0x9
+                * LCB_CFG_RX_FIFOS_RADR.DO_NOT_JUMP_VAL = 0xa
+                */
+               rx_radr =
+                     0xaull << DC_LCB_CFG_RX_FIFOS_RADR_DO_NOT_JUMP_VAL_SHIFT
+                   | 0x9ull << DC_LCB_CFG_RX_FIFOS_RADR_OK_TO_JUMP_VAL_SHIFT
+                   | 0x9ull << DC_LCB_CFG_RX_FIFOS_RADR_RST_VAL_SHIFT;
+               /*
+                * LCB_CFG_TX_FIFOS_RADR.ON_REINIT = 0 (default)
+                * LCB_CFG_TX_FIFOS_RADR.RST_VAL = 6
+                */
+               tx_radr = 6ull << DC_LCB_CFG_TX_FIFOS_RADR_RST_VAL_SHIFT;
+       } else if (version <= 0x18) {
+               /* release 0x13 up to 0x18 */
+               /* LCB_CFG_RX_FIFOS_RADR = 0x988 */
+               rx_radr =
+                     0x9ull << DC_LCB_CFG_RX_FIFOS_RADR_DO_NOT_JUMP_VAL_SHIFT
+                   | 0x8ull << DC_LCB_CFG_RX_FIFOS_RADR_OK_TO_JUMP_VAL_SHIFT
+                   | 0x8ull << DC_LCB_CFG_RX_FIFOS_RADR_RST_VAL_SHIFT;
+               tx_radr = 7ull << DC_LCB_CFG_TX_FIFOS_RADR_RST_VAL_SHIFT;
+       } else if (version == 0x19) {
+               /* release 0x19 */
+               /* LCB_CFG_RX_FIFOS_RADR = 0xa99 */
+               rx_radr =
+                     0xAull << DC_LCB_CFG_RX_FIFOS_RADR_DO_NOT_JUMP_VAL_SHIFT
+                   | 0x9ull << DC_LCB_CFG_RX_FIFOS_RADR_OK_TO_JUMP_VAL_SHIFT
+                   | 0x9ull << DC_LCB_CFG_RX_FIFOS_RADR_RST_VAL_SHIFT;
+               tx_radr = 3ull << DC_LCB_CFG_TX_FIFOS_RADR_RST_VAL_SHIFT;
+       } else if (version == 0x1a) {
+               /* release 0x1a */
+               /* LCB_CFG_RX_FIFOS_RADR = 0x988 */
+               rx_radr =
+                     0x9ull << DC_LCB_CFG_RX_FIFOS_RADR_DO_NOT_JUMP_VAL_SHIFT
+                   | 0x8ull << DC_LCB_CFG_RX_FIFOS_RADR_OK_TO_JUMP_VAL_SHIFT
+                   | 0x8ull << DC_LCB_CFG_RX_FIFOS_RADR_RST_VAL_SHIFT;
+               tx_radr = 7ull << DC_LCB_CFG_TX_FIFOS_RADR_RST_VAL_SHIFT;
+               write_csr(dd, DC_LCB_CFG_LN_DCLK, 1ull);
+       } else {
+               /* release 0x1b and higher */
+               /* LCB_CFG_RX_FIFOS_RADR = 0x877 */
+               rx_radr =
+                     0x8ull << DC_LCB_CFG_RX_FIFOS_RADR_DO_NOT_JUMP_VAL_SHIFT
+                   | 0x7ull << DC_LCB_CFG_RX_FIFOS_RADR_OK_TO_JUMP_VAL_SHIFT
+                   | 0x7ull << DC_LCB_CFG_RX_FIFOS_RADR_RST_VAL_SHIFT;
+               tx_radr = 3ull << DC_LCB_CFG_TX_FIFOS_RADR_RST_VAL_SHIFT;
+       }
+
+       write_csr(dd, DC_LCB_CFG_RX_FIFOS_RADR, rx_radr);
+       /* LCB_CFG_IGNORE_LOST_RCLK.EN = 1 */
+       write_csr(dd, DC_LCB_CFG_IGNORE_LOST_RCLK,
+               DC_LCB_CFG_IGNORE_LOST_RCLK_EN_SMASK);
+       write_csr(dd, DC_LCB_CFG_TX_FIFOS_RADR, tx_radr);
+}
+
+/*
+ * Handle a SMA idle message
+ *
+ * This is a work-queue function outside of the interrupt.
+ */
+void handle_sma_message(struct work_struct *work)
+{
+       struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata,
+                                                       sma_message_work);
+       struct hfi1_devdata *dd = ppd->dd;
+       u64 msg;
+       int ret;
+
+       /* msg is bytes 1-4 of the 40-bit idle message - the command code
+          is stripped off */
+       ret = read_idle_sma(dd, &msg);
+       if (ret)
+               return;
+       dd_dev_info(dd, "%s: SMA message 0x%llx\n", __func__, msg);
+       /*
+        * React to the SMA message.  Byte[1] (0 for us) is the command.
+        */
+       switch (msg & 0xff) {
+       case SMA_IDLE_ARM:
+               /*
+                * See OPAv1 table 9-14 - HFI and External Switch Ports Key
+                * State Transitions
+                *
+                * Only expected in INIT or ARMED, discard otherwise.
+                */
+               if (ppd->host_link_state & (HLS_UP_INIT | HLS_UP_ARMED))
+                       ppd->neighbor_normal = 1;
+               break;
+       case SMA_IDLE_ACTIVE:
+               /*
+                * See OPAv1 table 9-14 - HFI and External Switch Ports Key
+                * State Transitions
+                *
+                * Can activate the node.  Discard otherwise.
+                */
+               if (ppd->host_link_state == HLS_UP_ARMED
+                                       && ppd->is_active_optimize_enabled) {
+                       ppd->neighbor_normal = 1;
+                       ret = set_link_state(ppd, HLS_UP_ACTIVE);
+                       if (ret)
+                               dd_dev_err(
+                                       dd,
+                                       "%s: received Active SMA idle message, couldn't set link to Active\n",
+                                       __func__);
+               }
+               break;
+       default:
+               dd_dev_err(dd,
+                       "%s: received unexpected SMA idle message 0x%llx\n",
+                       __func__, msg);
+               break;
+       }
+}
+
+static void adjust_rcvctrl(struct hfi1_devdata *dd, u64 add, u64 clear)
+{
+       u64 rcvctrl;
+       unsigned long flags;
+
+       spin_lock_irqsave(&dd->rcvctrl_lock, flags);
+       rcvctrl = read_csr(dd, RCV_CTRL);
+       rcvctrl |= add;
+       rcvctrl &= ~clear;
+       write_csr(dd, RCV_CTRL, rcvctrl);
+       spin_unlock_irqrestore(&dd->rcvctrl_lock, flags);
+}
+
+static inline void add_rcvctrl(struct hfi1_devdata *dd, u64 add)
+{
+       adjust_rcvctrl(dd, add, 0);
+}
+
+static inline void clear_rcvctrl(struct hfi1_devdata *dd, u64 clear)
+{
+       adjust_rcvctrl(dd, 0, clear);
+}
+
+/*
+ * Called from all interrupt handlers to start handling an SPC freeze.
+ */
+void start_freeze_handling(struct hfi1_pportdata *ppd, int flags)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+       struct send_context *sc;
+       int i;
+
+       if (flags & FREEZE_SELF)
+               write_csr(dd, CCE_CTRL, CCE_CTRL_SPC_FREEZE_SMASK);
+
+       /* enter frozen mode */
+       dd->flags |= HFI1_FROZEN;
+
+       /* notify all SDMA engines that they are going into a freeze */
+       sdma_freeze_notify(dd, !!(flags & FREEZE_LINK_DOWN));
+
+       /* do halt pre-handling on all enabled send contexts */
+       for (i = 0; i < dd->num_send_contexts; i++) {
+               sc = dd->send_contexts[i].sc;
+               if (sc && (sc->flags & SCF_ENABLED))
+                       sc_stop(sc, SCF_FROZEN | SCF_HALTED);
+       }
+
+       /* Send context are frozen. Notify user space */
+       hfi1_set_uevent_bits(ppd, _HFI1_EVENT_FROZEN_BIT);
+
+       if (flags & FREEZE_ABORT) {
+               dd_dev_err(dd,
+                          "Aborted freeze recovery. Please REBOOT system\n");
+               return;
+       }
+       /* queue non-interrupt handler */
+       queue_work(ppd->hfi1_wq, &ppd->freeze_work);
+}
+
+/*
+ * Wait until all 4 sub-blocks indicate that they have frozen or unfrozen,
+ * depending on the "freeze" parameter.
+ *
+ * No need to return an error if it times out, our only option
+ * is to proceed anyway.
+ */
+static void wait_for_freeze_status(struct hfi1_devdata *dd, int freeze)
+{
+       unsigned long timeout;
+       u64 reg;
+
+       timeout = jiffies + msecs_to_jiffies(FREEZE_STATUS_TIMEOUT);
+       while (1) {
+               reg = read_csr(dd, CCE_STATUS);
+               if (freeze) {
+                       /* waiting until all indicators are set */
+                       if ((reg & ALL_FROZE) == ALL_FROZE)
+                               return; /* all done */
+               } else {
+                       /* waiting until all indicators are clear */
+                       if ((reg & ALL_FROZE) == 0)
+                               return; /* all done */
+               }
+
+               if (time_after(jiffies, timeout)) {
+                       dd_dev_err(dd,
+                               "Time out waiting for SPC %sfreeze, bits 0x%llx, expecting 0x%llx, continuing",
+                               freeze ? "" : "un",
+                               reg & ALL_FROZE,
+                               freeze ? ALL_FROZE : 0ull);
+                       return;
+               }
+               usleep_range(80, 120);
+       }
+}
+
+/*
+ * Do all freeze handling for the RXE block.
+ */
+static void rxe_freeze(struct hfi1_devdata *dd)
+{
+       int i;
+
+       /* disable port */
+       clear_rcvctrl(dd, RCV_CTRL_RCV_PORT_ENABLE_SMASK);
+
+       /* disable all receive contexts */
+       for (i = 0; i < dd->num_rcv_contexts; i++)
+               hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_DIS, i);
+}
+
+/*
+ * Unfreeze handling for the RXE block - kernel contexts only.
+ * This will also enable the port.  User contexts will do unfreeze
+ * handling on a per-context basis as they call into the driver.
+ *
+ */
+static void rxe_kernel_unfreeze(struct hfi1_devdata *dd)
+{
+       int i;
+
+       /* enable all kernel contexts */
+       for (i = 0; i < dd->n_krcv_queues; i++)
+               hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_ENB, i);
+
+       /* enable port */
+       add_rcvctrl(dd, RCV_CTRL_RCV_PORT_ENABLE_SMASK);
+}
+
+/*
+ * Non-interrupt SPC freeze handling.
+ *
+ * This is a work-queue function outside of the triggering interrupt.
+ */
+void handle_freeze(struct work_struct *work)
+{
+       struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata,
+                                                               freeze_work);
+       struct hfi1_devdata *dd = ppd->dd;
+
+       /* wait for freeze indicators on all affected blocks */
+       dd_dev_info(dd, "Entering SPC freeze\n");
+       wait_for_freeze_status(dd, 1);
+
+       /* SPC is now frozen */
+
+       /* do send PIO freeze steps */
+       pio_freeze(dd);
+
+       /* do send DMA freeze steps */
+       sdma_freeze(dd);
+
+       /* do send egress freeze steps - nothing to do */
+
+       /* do receive freeze steps */
+       rxe_freeze(dd);
+
+       /*
+        * Unfreeze the hardware - clear the freeze, wait for each
+        * block's frozen bit to clear, then clear the frozen flag.
+        */
+       write_csr(dd, CCE_CTRL, CCE_CTRL_SPC_UNFREEZE_SMASK);
+       wait_for_freeze_status(dd, 0);
+
+       if (is_a0(dd)) {
+               write_csr(dd, CCE_CTRL, CCE_CTRL_SPC_FREEZE_SMASK);
+               wait_for_freeze_status(dd, 1);
+               write_csr(dd, CCE_CTRL, CCE_CTRL_SPC_UNFREEZE_SMASK);
+               wait_for_freeze_status(dd, 0);
+       }
+
+       /* do send PIO unfreeze steps for kernel contexts */
+       pio_kernel_unfreeze(dd);
+
+       /* do send DMA unfreeze steps */
+       sdma_unfreeze(dd);
+
+       /* do send egress unfreeze steps - nothing to do */
+
+       /* do receive unfreeze steps for kernel contexts */
+       rxe_kernel_unfreeze(dd);
+
+       /*
+        * The unfreeze procedure touches global device registers when
+        * it disables and re-enables RXE. Mark the device unfrozen
+        * after all that is done so other parts of the driver waiting
+        * for the device to unfreeze don't do things out of order.
+        *
+        * The above implies that the meaning of HFI1_FROZEN flag is
+        * "Device has gone into freeze mode and freeze mode handling
+        * is still in progress."
+        *
+        * The flag will be removed when freeze mode processing has
+        * completed.
+        */
+       dd->flags &= ~HFI1_FROZEN;
+       wake_up(&dd->event_queue);
+
+       /* no longer frozen */
+       dd_dev_err(dd, "Exiting SPC freeze\n");
+}
+
+/*
+ * Handle a link up interrupt from the 8051.
+ *
+ * This is a work-queue function outside of the interrupt.
+ */
+void handle_link_up(struct work_struct *work)
+{
+       struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata,
+                                                               link_up_work);
+       set_link_state(ppd, HLS_UP_INIT);
+
+       /* cache the read of DC_LCB_STS_ROUND_TRIP_LTP_CNT */
+       read_ltp_rtt(ppd->dd);
+       /*
+        * OPA specifies that certain counters are cleared on a transition
+        * to link up, so do that.
+        */
+       clear_linkup_counters(ppd->dd);
+       /*
+        * And (re)set link up default values.
+        */
+       set_linkup_defaults(ppd);
+
+       /* enforce link speed enabled */
+       if ((ppd->link_speed_active & ppd->link_speed_enabled) == 0) {
+               /* oops - current speed is not enabled, bounce */
+               dd_dev_err(ppd->dd,
+                       "Link speed active 0x%x is outside enabled 0x%x, downing link\n",
+                       ppd->link_speed_active, ppd->link_speed_enabled);
+               set_link_down_reason(ppd, OPA_LINKDOWN_REASON_SPEED_POLICY, 0,
+                       OPA_LINKDOWN_REASON_SPEED_POLICY);
+               set_link_state(ppd, HLS_DN_OFFLINE);
+               start_link(ppd);
+       }
+}
+
+/* Several pieces of LNI information were cached for SMA in ppd.
+ * Reset these on link down */
+static void reset_neighbor_info(struct hfi1_pportdata *ppd)
+{
+       ppd->neighbor_guid = 0;
+       ppd->neighbor_port_number = 0;
+       ppd->neighbor_type = 0;
+       ppd->neighbor_fm_security = 0;
+}
+
+/*
+ * Handle a link down interrupt from the 8051.
+ *
+ * This is a work-queue function outside of the interrupt.
+ */
+void handle_link_down(struct work_struct *work)
+{
+       u8 lcl_reason, neigh_reason = 0;
+       struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata,
+                                                               link_down_work);
+
+       /* go offline first, then deal with reasons */
+       set_link_state(ppd, HLS_DN_OFFLINE);
+
+       lcl_reason = 0;
+       read_planned_down_reason_code(ppd->dd, &neigh_reason);
+
+       /*
+        * If no reason, assume peer-initiated but missed
+        * LinkGoingDown idle flits.
+        */
+       if (neigh_reason == 0)
+               lcl_reason = OPA_LINKDOWN_REASON_NEIGHBOR_UNKNOWN;
+
+       set_link_down_reason(ppd, lcl_reason, neigh_reason, 0);
+
+       reset_neighbor_info(ppd);
+
+       /* disable the port */
+       clear_rcvctrl(ppd->dd, RCV_CTRL_RCV_PORT_ENABLE_SMASK);
+
+       /* If there is no cable attached, turn the DC off. Otherwise,
+        * start the link bring up. */
+       if (!qsfp_mod_present(ppd))
+               dc_shutdown(ppd->dd);
+       else
+               start_link(ppd);
+}
+
+void handle_link_bounce(struct work_struct *work)
+{
+       struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata,
+                                                       link_bounce_work);
+
+       /*
+        * Only do something if the link is currently up.
+        */
+       if (ppd->host_link_state & HLS_UP) {
+               set_link_state(ppd, HLS_DN_OFFLINE);
+               start_link(ppd);
+       } else {
+               dd_dev_info(ppd->dd, "%s: link not up (%s), nothing to do\n",
+                       __func__, link_state_name(ppd->host_link_state));
+       }
+}
+
+/*
+ * Mask conversion: Capability exchange to Port LTP.  The capability
+ * exchange has an implicit 16b CRC that is mandatory.
+ */
+static int cap_to_port_ltp(int cap)
+{
+       int port_ltp = PORT_LTP_CRC_MODE_16; /* this mode is mandatory */
+
+       if (cap & CAP_CRC_14B)
+               port_ltp |= PORT_LTP_CRC_MODE_14;
+       if (cap & CAP_CRC_48B)
+               port_ltp |= PORT_LTP_CRC_MODE_48;
+       if (cap & CAP_CRC_12B_16B_PER_LANE)
+               port_ltp |= PORT_LTP_CRC_MODE_PER_LANE;
+
+       return port_ltp;
+}
+
+/*
+ * Convert an OPA Port LTP mask to capability mask
+ */
+int port_ltp_to_cap(int port_ltp)
+{
+       int cap_mask = 0;
+
+       if (port_ltp & PORT_LTP_CRC_MODE_14)
+               cap_mask |= CAP_CRC_14B;
+       if (port_ltp & PORT_LTP_CRC_MODE_48)
+               cap_mask |= CAP_CRC_48B;
+       if (port_ltp & PORT_LTP_CRC_MODE_PER_LANE)
+               cap_mask |= CAP_CRC_12B_16B_PER_LANE;
+
+       return cap_mask;
+}
+
+/*
+ * Convert a single DC LCB CRC mode to an OPA Port LTP mask.
+ */
+static int lcb_to_port_ltp(int lcb_crc)
+{
+       int port_ltp = 0;
+
+       if (lcb_crc == LCB_CRC_12B_16B_PER_LANE)
+               port_ltp = PORT_LTP_CRC_MODE_PER_LANE;
+       else if (lcb_crc == LCB_CRC_48B)
+               port_ltp = PORT_LTP_CRC_MODE_48;
+       else if (lcb_crc == LCB_CRC_14B)
+               port_ltp = PORT_LTP_CRC_MODE_14;
+       else
+               port_ltp = PORT_LTP_CRC_MODE_16;
+
+       return port_ltp;
+}
+
+/*
+ * Our neighbor has indicated that we are allowed to act as a fabric
+ * manager, so place the full management partition key in the second
+ * (0-based) pkey array position (see OPAv1, section 20.2.2.6.8). Note
+ * that we should already have the limited management partition key in
+ * array element 1, and also that the port is not yet up when
+ * add_full_mgmt_pkey() is invoked.
+ */
+static void add_full_mgmt_pkey(struct hfi1_pportdata *ppd)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+
+       /* Sanity check - ppd->pkeys[2] should be 0 */
+       if (ppd->pkeys[2] != 0)
+               dd_dev_err(dd, "%s pkey[2] already set to 0x%x, resetting it to 0x%x\n",
+                          __func__, ppd->pkeys[2], FULL_MGMT_P_KEY);
+       ppd->pkeys[2] = FULL_MGMT_P_KEY;
+       (void)hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_PKEYS, 0);
+}
+
+/*
+ * Convert the given link width to the OPA link width bitmask.
+ */
+static u16 link_width_to_bits(struct hfi1_devdata *dd, u16 width)
+{
+       switch (width) {
+       case 0:
+               /*
+                * Simulator and quick linkup do not set the width.
+                * Just set it to 4x without complaint.
+                */
+               if (dd->icode == ICODE_FUNCTIONAL_SIMULATOR || quick_linkup)
+                       return OPA_LINK_WIDTH_4X;
+               return 0; /* no lanes up */
+       case 1: return OPA_LINK_WIDTH_1X;
+       case 2: return OPA_LINK_WIDTH_2X;
+       case 3: return OPA_LINK_WIDTH_3X;
+       default:
+               dd_dev_info(dd, "%s: invalid width %d, using 4\n",
+                       __func__, width);
+               /* fall through */
+       case 4: return OPA_LINK_WIDTH_4X;
+       }
+}
+
+/*
+ * Do a population count on the bottom nibble.
+ */
+static const u8 bit_counts[16] = {
+       0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4
+};
+static inline u8 nibble_to_count(u8 nibble)
+{
+       return bit_counts[nibble & 0xf];
+}
+
+/*
+ * Read the active lane information from the 8051 registers and return
+ * their widths.
+ *
+ * Active lane information is found in these 8051 registers:
+ *     enable_lane_tx
+ *     enable_lane_rx
+ */
+static void get_link_widths(struct hfi1_devdata *dd, u16 *tx_width,
+                           u16 *rx_width)
+{
+       u16 tx, rx;
+       u8 enable_lane_rx;
+       u8 enable_lane_tx;
+       u8 tx_polarity_inversion;
+       u8 rx_polarity_inversion;
+       u8 max_rate;
+
+       /* read the active lanes */
+       read_tx_settings(dd, &enable_lane_tx, &tx_polarity_inversion,
+                               &rx_polarity_inversion, &max_rate);
+       read_local_lni(dd, &enable_lane_rx);
+
+       /* convert to counts */
+       tx = nibble_to_count(enable_lane_tx);
+       rx = nibble_to_count(enable_lane_rx);
+
+       /*
+        * Set link_speed_active here, overriding what was set in
+        * handle_verify_cap().  The ASIC 8051 firmware does not correctly
+        * set the max_rate field in handle_verify_cap until v0.19.
+        */
+       if ((dd->icode == ICODE_RTL_SILICON)
+                               && (dd->dc8051_ver < dc8051_ver(0, 19))) {
+               /* max_rate: 0 = 12.5G, 1 = 25G */
+               switch (max_rate) {
+               case 0:
+                       dd->pport[0].link_speed_active = OPA_LINK_SPEED_12_5G;
+                       break;
+               default:
+                       dd_dev_err(dd,
+                               "%s: unexpected max rate %d, using 25Gb\n",
+                               __func__, (int)max_rate);
+                       /* fall through */
+               case 1:
+                       dd->pport[0].link_speed_active = OPA_LINK_SPEED_25G;
+                       break;
+               }
+       }
+
+       dd_dev_info(dd,
+               "Fabric active lanes (width): tx 0x%x (%d), rx 0x%x (%d)\n",
+               enable_lane_tx, tx, enable_lane_rx, rx);
+       *tx_width = link_width_to_bits(dd, tx);
+       *rx_width = link_width_to_bits(dd, rx);
+}
+
+/*
+ * Read verify_cap_local_fm_link_width[1] to obtain the link widths.
+ * Valid after the end of VerifyCap and during LinkUp.  Does not change
+ * after link up.  I.e. look elsewhere for downgrade information.
+ *
+ * Bits are:
+ *     + bits [7:4] contain the number of active transmitters
+ *     + bits [3:0] contain the number of active receivers
+ * These are numbers 1 through 4 and can be different values if the
+ * link is asymmetric.
+ *
+ * verify_cap_local_fm_link_width[0] retains its original value.
+ */
+static void get_linkup_widths(struct hfi1_devdata *dd, u16 *tx_width,
+                             u16 *rx_width)
+{
+       u16 widths, tx, rx;
+       u8 misc_bits, local_flags;
+       u16 active_tx, active_rx;
+
+       read_vc_local_link_width(dd, &misc_bits, &local_flags, &widths);
+       tx = widths >> 12;
+       rx = (widths >> 8) & 0xf;
+
+       *tx_width = link_width_to_bits(dd, tx);
+       *rx_width = link_width_to_bits(dd, rx);
+
+       /* print the active widths */
+       get_link_widths(dd, &active_tx, &active_rx);
+}
+
+/*
+ * Set ppd->link_width_active and ppd->link_width_downgrade_active using
+ * hardware information when the link first comes up.
+ *
+ * The link width is not available until after VerifyCap.AllFramesReceived
+ * (the trigger for handle_verify_cap), so this is outside that routine
+ * and should be called when the 8051 signals linkup.
+ */
+void get_linkup_link_widths(struct hfi1_pportdata *ppd)
+{
+       u16 tx_width, rx_width;
+
+       /* get end-of-LNI link widths */
+       get_linkup_widths(ppd->dd, &tx_width, &rx_width);
+
+       /* use tx_width as the link is supposed to be symmetric on link up */
+       ppd->link_width_active = tx_width;
+       /* link width downgrade active (LWD.A) starts out matching LW.A */
+       ppd->link_width_downgrade_tx_active = ppd->link_width_active;
+       ppd->link_width_downgrade_rx_active = ppd->link_width_active;
+       /* per OPA spec, on link up LWD.E resets to LWD.S */
+       ppd->link_width_downgrade_enabled = ppd->link_width_downgrade_supported;
+       /* cache the active egress rate (units {10^6 bits/sec]) */
+       ppd->current_egress_rate = active_egress_rate(ppd);
+}
+
+/*
+ * Handle a verify capabilities interrupt from the 8051.
+ *
+ * This is a work-queue function outside of the interrupt.
+ */
+void handle_verify_cap(struct work_struct *work)
+{
+       struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata,
+                                                               link_vc_work);
+       struct hfi1_devdata *dd = ppd->dd;
+       u64 reg;
+       u8 power_management;
+       u8 continious;
+       u8 vcu;
+       u8 vau;
+       u8 z;
+       u16 vl15buf;
+       u16 link_widths;
+       u16 crc_mask;
+       u16 crc_val;
+       u16 device_id;
+       u16 active_tx, active_rx;
+       u8 partner_supported_crc;
+       u8 remote_tx_rate;
+       u8 device_rev;
+
+       set_link_state(ppd, HLS_VERIFY_CAP);
+
+       lcb_shutdown(dd, 0);
+       adjust_lcb_for_fpga_serdes(dd);
+
+       /*
+        * These are now valid:
+        *      remote VerifyCap fields in the general LNI config
+        *      CSR DC8051_STS_REMOTE_GUID
+        *      CSR DC8051_STS_REMOTE_NODE_TYPE
+        *      CSR DC8051_STS_REMOTE_FM_SECURITY
+        *      CSR DC8051_STS_REMOTE_PORT_NO
+        */
+
+       read_vc_remote_phy(dd, &power_management, &continious);
+       read_vc_remote_fabric(
+               dd,
+               &vau,
+               &z,
+               &vcu,
+               &vl15buf,
+               &partner_supported_crc);
+       read_vc_remote_link_width(dd, &remote_tx_rate, &link_widths);
+       read_remote_device_id(dd, &device_id, &device_rev);
+       /*
+        * And the 'MgmtAllowed' information, which is exchanged during
+        * LNI, is also be available at this point.
+        */
+       read_mgmt_allowed(dd, &ppd->mgmt_allowed);
+       /* print the active widths */
+       get_link_widths(dd, &active_tx, &active_rx);
+       dd_dev_info(dd,
+               "Peer PHY: power management 0x%x, continuous updates 0x%x\n",
+               (int)power_management, (int)continious);
+       dd_dev_info(dd,
+               "Peer Fabric: vAU %d, Z %d, vCU %d, vl15 credits 0x%x, CRC sizes 0x%x\n",
+               (int)vau,
+               (int)z,
+               (int)vcu,
+               (int)vl15buf,
+               (int)partner_supported_crc);
+       dd_dev_info(dd, "Peer Link Width: tx rate 0x%x, widths 0x%x\n",
+               (u32)remote_tx_rate, (u32)link_widths);
+       dd_dev_info(dd, "Peer Device ID: 0x%04x, Revision 0x%02x\n",
+               (u32)device_id, (u32)device_rev);
+       /*
+        * The peer vAU value just read is the peer receiver value.  HFI does
+        * not support a transmit vAU of 0 (AU == 8).  We advertised that
+        * with Z=1 in the fabric capabilities sent to the peer.  The peer
+        * will see our Z=1, and, if it advertised a vAU of 0, will move its
+        * receive to vAU of 1 (AU == 16).  Do the same here.  We do not care
+        * about the peer Z value - our sent vAU is 3 (hardwired) and is not
+        * subject to the Z value exception.
+        */
+       if (vau == 0)
+               vau = 1;
+       set_up_vl15(dd, vau, vl15buf);
+
+       /* set up the LCB CRC mode */
+       crc_mask = ppd->port_crc_mode_enabled & partner_supported_crc;
+
+       /* order is important: use the lowest bit in common */
+       if (crc_mask & CAP_CRC_14B)
+               crc_val = LCB_CRC_14B;
+       else if (crc_mask & CAP_CRC_48B)
+               crc_val = LCB_CRC_48B;
+       else if (crc_mask & CAP_CRC_12B_16B_PER_LANE)
+               crc_val = LCB_CRC_12B_16B_PER_LANE;
+       else
+               crc_val = LCB_CRC_16B;
+
+       dd_dev_info(dd, "Final LCB CRC mode: %d\n", (int)crc_val);
+       write_csr(dd, DC_LCB_CFG_CRC_MODE,
+                 (u64)crc_val << DC_LCB_CFG_CRC_MODE_TX_VAL_SHIFT);
+
+       /* set (14b only) or clear sideband credit */
+       reg = read_csr(dd, SEND_CM_CTRL);
+       if (crc_val == LCB_CRC_14B && crc_14b_sideband) {
+               write_csr(dd, SEND_CM_CTRL,
+                       reg | SEND_CM_CTRL_FORCE_CREDIT_MODE_SMASK);
+       } else {
+               write_csr(dd, SEND_CM_CTRL,
+                       reg & ~SEND_CM_CTRL_FORCE_CREDIT_MODE_SMASK);
+       }
+
+       ppd->link_speed_active = 0;     /* invalid value */
+       if (dd->dc8051_ver < dc8051_ver(0, 20)) {
+               /* remote_tx_rate: 0 = 12.5G, 1 = 25G */
+               switch (remote_tx_rate) {
+               case 0:
+                       ppd->link_speed_active = OPA_LINK_SPEED_12_5G;
+                       break;
+               case 1:
+                       ppd->link_speed_active = OPA_LINK_SPEED_25G;
+                       break;
+               }
+       } else {
+               /* actual rate is highest bit of the ANDed rates */
+               u8 rate = remote_tx_rate & ppd->local_tx_rate;
+
+               if (rate & 2)
+                       ppd->link_speed_active = OPA_LINK_SPEED_25G;
+               else if (rate & 1)
+                       ppd->link_speed_active = OPA_LINK_SPEED_12_5G;
+       }
+       if (ppd->link_speed_active == 0) {
+               dd_dev_err(dd, "%s: unexpected remote tx rate %d, using 25Gb\n",
+                       __func__, (int)remote_tx_rate);
+               ppd->link_speed_active = OPA_LINK_SPEED_25G;
+       }
+
+       /*
+        * Cache the values of the supported, enabled, and active
+        * LTP CRC modes to return in 'portinfo' queries. But the bit
+        * flags that are returned in the portinfo query differ from
+        * what's in the link_crc_mask, crc_sizes, and crc_val
+        * variables. Convert these here.
+        */
+       ppd->port_ltp_crc_mode = cap_to_port_ltp(link_crc_mask) << 8;
+               /* supported crc modes */
+       ppd->port_ltp_crc_mode |=
+               cap_to_port_ltp(ppd->port_crc_mode_enabled) << 4;
+               /* enabled crc modes */
+       ppd->port_ltp_crc_mode |= lcb_to_port_ltp(crc_val);
+               /* active crc mode */
+
+       /* set up the remote credit return table */
+       assign_remote_cm_au_table(dd, vcu);
+
+       /*
+        * The LCB is reset on entry to handle_verify_cap(), so this must
+        * be applied on every link up.
+        *
+        * Adjust LCB error kill enable to kill the link if
+        * these RBUF errors are seen:
+        *      REPLAY_BUF_MBE_SMASK
+        *      FLIT_INPUT_BUF_MBE_SMASK
+        */
+       if (is_a0(dd)) {                        /* fixed in B0 */
+               reg = read_csr(dd, DC_LCB_CFG_LINK_KILL_EN);
+               reg |= DC_LCB_CFG_LINK_KILL_EN_REPLAY_BUF_MBE_SMASK
+                       | DC_LCB_CFG_LINK_KILL_EN_FLIT_INPUT_BUF_MBE_SMASK;
+               write_csr(dd, DC_LCB_CFG_LINK_KILL_EN, reg);
+       }
+
+       /* pull LCB fifos out of reset - all fifo clocks must be stable */
+       write_csr(dd, DC_LCB_CFG_TX_FIFOS_RESET, 0);
+
+       /* give 8051 access to the LCB CSRs */
+       write_csr(dd, DC_LCB_ERR_EN, 0); /* mask LCB errors */
+       set_8051_lcb_access(dd);
+
+       ppd->neighbor_guid =
+               read_csr(dd, DC_DC8051_STS_REMOTE_GUID);
+       ppd->neighbor_port_number = read_csr(dd, DC_DC8051_STS_REMOTE_PORT_NO) &
+                                       DC_DC8051_STS_REMOTE_PORT_NO_VAL_SMASK;
+       ppd->neighbor_type =
+               read_csr(dd, DC_DC8051_STS_REMOTE_NODE_TYPE) &
+               DC_DC8051_STS_REMOTE_NODE_TYPE_VAL_MASK;
+       ppd->neighbor_fm_security =
+               read_csr(dd, DC_DC8051_STS_REMOTE_FM_SECURITY) &
+               DC_DC8051_STS_LOCAL_FM_SECURITY_DISABLED_MASK;
+       dd_dev_info(dd,
+               "Neighbor Guid: %llx Neighbor type %d MgmtAllowed %d FM security bypass %d\n",
+               ppd->neighbor_guid, ppd->neighbor_type,
+               ppd->mgmt_allowed, ppd->neighbor_fm_security);
+       if (ppd->mgmt_allowed)
+               add_full_mgmt_pkey(ppd);
+
+       /* tell the 8051 to go to LinkUp */
+       set_link_state(ppd, HLS_GOING_UP);
+}
+
+/*
+ * Apply the link width downgrade enabled policy against the current active
+ * link widths.
+ *
+ * Called when the enabled policy changes or the active link widths change.
+ */
+void apply_link_downgrade_policy(struct hfi1_pportdata *ppd, int refresh_widths)
+{
+       int skip = 1;
+       int do_bounce = 0;
+       u16 lwde = ppd->link_width_downgrade_enabled;
+       u16 tx, rx;
+
+       mutex_lock(&ppd->hls_lock);
+       /* only apply if the link is up */
+       if (ppd->host_link_state & HLS_UP)
+               skip = 0;
+       mutex_unlock(&ppd->hls_lock);
+       if (skip)
+               return;
+
+       if (refresh_widths) {
+               get_link_widths(ppd->dd, &tx, &rx);
+               ppd->link_width_downgrade_tx_active = tx;
+               ppd->link_width_downgrade_rx_active = rx;
+       }
+
+       if (lwde == 0) {
+               /* downgrade is disabled */
+
+               /* bounce if not at starting active width */
+               if ((ppd->link_width_active !=
+                                       ppd->link_width_downgrade_tx_active)
+                               || (ppd->link_width_active !=
+                                       ppd->link_width_downgrade_rx_active)) {
+                       dd_dev_err(ppd->dd,
+                               "Link downgrade is disabled and link has downgraded, downing link\n");
+                       dd_dev_err(ppd->dd,
+                               "  original 0x%x, tx active 0x%x, rx active 0x%x\n",
+                               ppd->link_width_active,
+                               ppd->link_width_downgrade_tx_active,
+                               ppd->link_width_downgrade_rx_active);
+                       do_bounce = 1;
+               }
+       } else if ((lwde & ppd->link_width_downgrade_tx_active) == 0
+               || (lwde & ppd->link_width_downgrade_rx_active) == 0) {
+               /* Tx or Rx is outside the enabled policy */
+               dd_dev_err(ppd->dd,
+                       "Link is outside of downgrade allowed, downing link\n");
+               dd_dev_err(ppd->dd,
+                       "  enabled 0x%x, tx active 0x%x, rx active 0x%x\n",
+                       lwde,
+                       ppd->link_width_downgrade_tx_active,
+                       ppd->link_width_downgrade_rx_active);
+               do_bounce = 1;
+       }
+
+       if (do_bounce) {
+               set_link_down_reason(ppd, OPA_LINKDOWN_REASON_WIDTH_POLICY, 0,
+                 OPA_LINKDOWN_REASON_WIDTH_POLICY);
+               set_link_state(ppd, HLS_DN_OFFLINE);
+               start_link(ppd);
+       }
+}
+
+/*
+ * Handle a link downgrade interrupt from the 8051.
+ *
+ * This is a work-queue function outside of the interrupt.
+ */
+void handle_link_downgrade(struct work_struct *work)
+{
+       struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata,
+                                                       link_downgrade_work);
+
+       dd_dev_info(ppd->dd, "8051: Link width downgrade\n");
+       apply_link_downgrade_policy(ppd, 1);
+}
+
+static char *dcc_err_string(char *buf, int buf_len, u64 flags)
+{
+       return flag_string(buf, buf_len, flags, dcc_err_flags,
+               ARRAY_SIZE(dcc_err_flags));
+}
+
+static char *lcb_err_string(char *buf, int buf_len, u64 flags)
+{
+       return flag_string(buf, buf_len, flags, lcb_err_flags,
+               ARRAY_SIZE(lcb_err_flags));
+}
+
+static char *dc8051_err_string(char *buf, int buf_len, u64 flags)
+{
+       return flag_string(buf, buf_len, flags, dc8051_err_flags,
+               ARRAY_SIZE(dc8051_err_flags));
+}
+
+static char *dc8051_info_err_string(char *buf, int buf_len, u64 flags)
+{
+       return flag_string(buf, buf_len, flags, dc8051_info_err_flags,
+               ARRAY_SIZE(dc8051_info_err_flags));
+}
+
+static char *dc8051_info_host_msg_string(char *buf, int buf_len, u64 flags)
+{
+       return flag_string(buf, buf_len, flags, dc8051_info_host_msg_flags,
+               ARRAY_SIZE(dc8051_info_host_msg_flags));
+}
+
+static void handle_8051_interrupt(struct hfi1_devdata *dd, u32 unused, u64 reg)
+{
+       struct hfi1_pportdata *ppd = dd->pport;
+       u64 info, err, host_msg;
+       int queue_link_down = 0;
+       char buf[96];
+
+       /* look at the flags */
+       if (reg & DC_DC8051_ERR_FLG_SET_BY_8051_SMASK) {
+               /* 8051 information set by firmware */
+               /* read DC8051_DBG_ERR_INFO_SET_BY_8051 for details */
+               info = read_csr(dd, DC_DC8051_DBG_ERR_INFO_SET_BY_8051);
+               err = (info >> DC_DC8051_DBG_ERR_INFO_SET_BY_8051_ERROR_SHIFT)
+                       & DC_DC8051_DBG_ERR_INFO_SET_BY_8051_ERROR_MASK;
+               host_msg = (info >>
+                       DC_DC8051_DBG_ERR_INFO_SET_BY_8051_HOST_MSG_SHIFT)
+                       & DC_DC8051_DBG_ERR_INFO_SET_BY_8051_HOST_MSG_MASK;
+
+               /*
+                * Handle error flags.
+                */
+               if (err & FAILED_LNI) {
+                       /*
+                        * LNI error indications are cleared by the 8051
+                        * only when starting polling.  Only pay attention
+                        * to them when in the states that occur during
+                        * LNI.
+                        */
+                       if (ppd->host_link_state
+                           & (HLS_DN_POLL | HLS_VERIFY_CAP | HLS_GOING_UP)) {
+                               queue_link_down = 1;
+                               dd_dev_info(dd, "Link error: %s\n",
+                                       dc8051_info_err_string(buf,
+                                               sizeof(buf),
+                                               err & FAILED_LNI));
+                       }
+                       err &= ~(u64)FAILED_LNI;
+               }
+               if (err) {
+                       /* report remaining errors, but do not do anything */
+                       dd_dev_err(dd, "8051 info error: %s\n",
+                               dc8051_info_err_string(buf, sizeof(buf), err));
+               }
+
+               /*
+                * Handle host message flags.
+                */
+               if (host_msg & HOST_REQ_DONE) {
+                       /*
+                        * Presently, the driver does a busy wait for
+                        * host requests to complete.  This is only an
+                        * informational message.
+                        * NOTE: The 8051 clears the host message
+                        * information *on the next 8051 command*.
+                        * Therefore, when linkup is achieved,
+                        * this flag will still be set.
+                        */
+                       host_msg &= ~(u64)HOST_REQ_DONE;
+               }
+               if (host_msg & BC_SMA_MSG) {
+                       queue_work(ppd->hfi1_wq, &ppd->sma_message_work);
+                       host_msg &= ~(u64)BC_SMA_MSG;
+               }
+               if (host_msg & LINKUP_ACHIEVED) {
+                       dd_dev_info(dd, "8051: Link up\n");
+                       queue_work(ppd->hfi1_wq, &ppd->link_up_work);
+                       host_msg &= ~(u64)LINKUP_ACHIEVED;
+               }
+               if (host_msg & EXT_DEVICE_CFG_REQ) {
+                       handle_8051_request(dd);
+                       host_msg &= ~(u64)EXT_DEVICE_CFG_REQ;
+               }
+               if (host_msg & VERIFY_CAP_FRAME) {
+                       queue_work(ppd->hfi1_wq, &ppd->link_vc_work);
+                       host_msg &= ~(u64)VERIFY_CAP_FRAME;
+               }
+               if (host_msg & LINK_GOING_DOWN) {
+                       const char *extra = "";
+                       /* no downgrade action needed if going down */
+                       if (host_msg & LINK_WIDTH_DOWNGRADED) {
+                               host_msg &= ~(u64)LINK_WIDTH_DOWNGRADED;
+                               extra = " (ignoring downgrade)";
+                       }
+                       dd_dev_info(dd, "8051: Link down%s\n", extra);
+                       queue_link_down = 1;
+                       host_msg &= ~(u64)LINK_GOING_DOWN;
+               }
+               if (host_msg & LINK_WIDTH_DOWNGRADED) {
+                       queue_work(ppd->hfi1_wq, &ppd->link_downgrade_work);
+                       host_msg &= ~(u64)LINK_WIDTH_DOWNGRADED;
+               }
+               if (host_msg) {
+                       /* report remaining messages, but do not do anything */
+                       dd_dev_info(dd, "8051 info host message: %s\n",
+                               dc8051_info_host_msg_string(buf, sizeof(buf),
+                                       host_msg));
+               }
+
+               reg &= ~DC_DC8051_ERR_FLG_SET_BY_8051_SMASK;
+       }
+       if (reg & DC_DC8051_ERR_FLG_LOST_8051_HEART_BEAT_SMASK) {
+               /*
+                * Lost the 8051 heartbeat.  If this happens, we
+                * receive constant interrupts about it.  Disable
+                * the interrupt after the first.
+                */
+               dd_dev_err(dd, "Lost 8051 heartbeat\n");
+               write_csr(dd, DC_DC8051_ERR_EN,
+                       read_csr(dd, DC_DC8051_ERR_EN)
+                         & ~DC_DC8051_ERR_EN_LOST_8051_HEART_BEAT_SMASK);
+
+               reg &= ~DC_DC8051_ERR_FLG_LOST_8051_HEART_BEAT_SMASK;
+       }
+       if (reg) {
+               /* report the error, but do not do anything */
+               dd_dev_err(dd, "8051 error: %s\n",
+                       dc8051_err_string(buf, sizeof(buf), reg));
+       }
+
+       if (queue_link_down) {
+               /* if the link is already going down or disabled, do not
+                * queue another */
+               if ((ppd->host_link_state
+                                   & (HLS_GOING_OFFLINE|HLS_LINK_COOLDOWN))
+                               || ppd->link_enabled == 0) {
+                       dd_dev_info(dd, "%s: not queuing link down\n",
+                               __func__);
+               } else {
+                       queue_work(ppd->hfi1_wq, &ppd->link_down_work);
+               }
+       }
+}
+
+static const char * const fm_config_txt[] = {
+[0] =
+       "BadHeadDist: Distance violation between two head flits",
+[1] =
+       "BadTailDist: Distance violation between two tail flits",
+[2] =
+       "BadCtrlDist: Distance violation between two credit control flits",
+[3] =
+       "BadCrdAck: Credits return for unsupported VL",
+[4] =
+       "UnsupportedVLMarker: Received VL Marker",
+[5] =
+       "BadPreempt: Exceeded the preemption nesting level",
+[6] =
+       "BadControlFlit: Received unsupported control flit",
+/* no 7 */
+[8] =
+       "UnsupportedVLMarker: Received VL Marker for unconfigured or disabled VL",
+};
+
+static const char * const port_rcv_txt[] = {
+[1] =
+       "BadPktLen: Illegal PktLen",
+[2] =
+       "PktLenTooLong: Packet longer than PktLen",
+[3] =
+       "PktLenTooShort: Packet shorter than PktLen",
+[4] =
+       "BadSLID: Illegal SLID (0, using multicast as SLID, does not include security validation of SLID)",
+[5] =
+       "BadDLID: Illegal DLID (0, doesn't match HFI)",
+[6] =
+       "BadL2: Illegal L2 opcode",
+[7] =
+       "BadSC: Unsupported SC",
+[9] =
+       "BadRC: Illegal RC",
+[11] =
+       "PreemptError: Preempting with same VL",
+[12] =
+       "PreemptVL15: Preempting a VL15 packet",
+};
+
+#define OPA_LDR_FMCONFIG_OFFSET 16
+#define OPA_LDR_PORTRCV_OFFSET 0
+static void handle_dcc_err(struct hfi1_devdata *dd, u32 unused, u64 reg)
+{
+       u64 info, hdr0, hdr1;
+       const char *extra;
+       char buf[96];
+       struct hfi1_pportdata *ppd = dd->pport;
+       u8 lcl_reason = 0;
+       int do_bounce = 0;
+
+       if (reg & DCC_ERR_FLG_UNCORRECTABLE_ERR_SMASK) {
+               if (!(dd->err_info_uncorrectable & OPA_EI_STATUS_SMASK)) {
+                       info = read_csr(dd, DCC_ERR_INFO_UNCORRECTABLE);
+                       dd->err_info_uncorrectable = info & OPA_EI_CODE_SMASK;
+                       /* set status bit */
+                       dd->err_info_uncorrectable |= OPA_EI_STATUS_SMASK;
+               }
+               reg &= ~DCC_ERR_FLG_UNCORRECTABLE_ERR_SMASK;
+       }
+
+       if (reg & DCC_ERR_FLG_LINK_ERR_SMASK) {
+               struct hfi1_pportdata *ppd = dd->pport;
+               /* this counter saturates at (2^32) - 1 */
+               if (ppd->link_downed < (u32)UINT_MAX)
+                       ppd->link_downed++;
+               reg &= ~DCC_ERR_FLG_LINK_ERR_SMASK;
+       }
+
+       if (reg & DCC_ERR_FLG_FMCONFIG_ERR_SMASK) {
+               u8 reason_valid = 1;
+
+               info = read_csr(dd, DCC_ERR_INFO_FMCONFIG);
+               if (!(dd->err_info_fmconfig & OPA_EI_STATUS_SMASK)) {
+                       dd->err_info_fmconfig = info & OPA_EI_CODE_SMASK;
+                       /* set status bit */
+                       dd->err_info_fmconfig |= OPA_EI_STATUS_SMASK;
+               }
+               switch (info) {
+               case 0:
+               case 1:
+               case 2:
+               case 3:
+               case 4:
+               case 5:
+               case 6:
+                       extra = fm_config_txt[info];
+                       break;
+               case 8:
+                       extra = fm_config_txt[info];
+                       if (ppd->port_error_action &
+                           OPA_PI_MASK_FM_CFG_UNSUPPORTED_VL_MARKER) {
+                               do_bounce = 1;
+                               /*
+                                * lcl_reason cannot be derived from info
+                                * for this error
+                                */
+                               lcl_reason =
+                                 OPA_LINKDOWN_REASON_UNSUPPORTED_VL_MARKER;
+                       }
+                       break;
+               default:
+                       reason_valid = 0;
+                       snprintf(buf, sizeof(buf), "reserved%lld", info);
+                       extra = buf;
+                       break;
+               }
+
+               if (reason_valid && !do_bounce) {
+                       do_bounce = ppd->port_error_action &
+                                       (1 << (OPA_LDR_FMCONFIG_OFFSET + info));
+                       lcl_reason = info + OPA_LINKDOWN_REASON_BAD_HEAD_DIST;
+               }
+
+               /* just report this */
+               dd_dev_info(dd, "DCC Error: fmconfig error: %s\n", extra);
+               reg &= ~DCC_ERR_FLG_FMCONFIG_ERR_SMASK;
+       }
+
+       if (reg & DCC_ERR_FLG_RCVPORT_ERR_SMASK) {
+               u8 reason_valid = 1;
+
+               info = read_csr(dd, DCC_ERR_INFO_PORTRCV);
+               hdr0 = read_csr(dd, DCC_ERR_INFO_PORTRCV_HDR0);
+               hdr1 = read_csr(dd, DCC_ERR_INFO_PORTRCV_HDR1);
+               if (!(dd->err_info_rcvport.status_and_code &
+                     OPA_EI_STATUS_SMASK)) {
+                       dd->err_info_rcvport.status_and_code =
+                               info & OPA_EI_CODE_SMASK;
+                       /* set status bit */
+                       dd->err_info_rcvport.status_and_code |=
+                               OPA_EI_STATUS_SMASK;
+                       /* save first 2 flits in the packet that caused
+                        * the error */
+                        dd->err_info_rcvport.packet_flit1 = hdr0;
+                        dd->err_info_rcvport.packet_flit2 = hdr1;
+               }
+               switch (info) {
+               case 1:
+               case 2:
+               case 3:
+               case 4:
+               case 5:
+               case 6:
+               case 7:
+               case 9:
+               case 11:
+               case 12:
+                       extra = port_rcv_txt[info];
+                       break;
+               default:
+                       reason_valid = 0;
+                       snprintf(buf, sizeof(buf), "reserved%lld", info);
+                       extra = buf;
+                       break;
+               }
+
+               if (reason_valid && !do_bounce) {
+                       do_bounce = ppd->port_error_action &
+                                       (1 << (OPA_LDR_PORTRCV_OFFSET + info));
+                       lcl_reason = info + OPA_LINKDOWN_REASON_RCV_ERROR_0;
+               }
+
+               /* just report this */
+               dd_dev_info(dd, "DCC Error: PortRcv error: %s\n", extra);
+               dd_dev_info(dd, "           hdr0 0x%llx, hdr1 0x%llx\n",
+                       hdr0, hdr1);
+
+               reg &= ~DCC_ERR_FLG_RCVPORT_ERR_SMASK;
+       }
+
+       if (reg & DCC_ERR_FLG_EN_CSR_ACCESS_BLOCKED_UC_SMASK) {
+               /* informative only */
+               dd_dev_info(dd, "8051 access to LCB blocked\n");
+               reg &= ~DCC_ERR_FLG_EN_CSR_ACCESS_BLOCKED_UC_SMASK;
+       }
+       if (reg & DCC_ERR_FLG_EN_CSR_ACCESS_BLOCKED_HOST_SMASK) {
+               /* informative only */
+               dd_dev_info(dd, "host access to LCB blocked\n");
+               reg &= ~DCC_ERR_FLG_EN_CSR_ACCESS_BLOCKED_HOST_SMASK;
+       }
+
+       /* report any remaining errors */
+       if (reg)
+               dd_dev_info(dd, "DCC Error: %s\n",
+                       dcc_err_string(buf, sizeof(buf), reg));
+
+       if (lcl_reason == 0)
+               lcl_reason = OPA_LINKDOWN_REASON_UNKNOWN;
+
+       if (do_bounce) {
+               dd_dev_info(dd, "%s: PortErrorAction bounce\n", __func__);
+               set_link_down_reason(ppd, lcl_reason, 0, lcl_reason);
+               queue_work(ppd->hfi1_wq, &ppd->link_bounce_work);
+       }
+}
+
+static void handle_lcb_err(struct hfi1_devdata *dd, u32 unused, u64 reg)
+{
+       char buf[96];
+
+       dd_dev_info(dd, "LCB Error: %s\n",
+               lcb_err_string(buf, sizeof(buf), reg));
+}
+
+/*
+ * CCE block DC interrupt.  Source is < 8.
+ */
+static void is_dc_int(struct hfi1_devdata *dd, unsigned int source)
+{
+       const struct err_reg_info *eri = &dc_errs[source];
+
+       if (eri->handler) {
+               interrupt_clear_down(dd, 0, eri);
+       } else if (source == 3 /* dc_lbm_int */) {
+               /*
+                * This indicates that a parity error has occurred on the
+                * address/control lines presented to the LBM.  The error
+                * is a single pulse, there is no associated error flag,
+                * and it is non-maskable.  This is because if a parity
+                * error occurs on the request the request is dropped.
+                * This should never occur, but it is nice to know if it
+                * ever does.
+                */
+               dd_dev_err(dd, "Parity error in DC LBM block\n");
+       } else {
+               dd_dev_err(dd, "Invalid DC interrupt %u\n", source);
+       }
+}
+
+/*
+ * TX block send credit interrupt.  Source is < 160.
+ */
+static void is_send_credit_int(struct hfi1_devdata *dd, unsigned int source)
+{
+       sc_group_release_update(dd, source);
+}
+
+/*
+ * TX block SDMA interrupt.  Source is < 48.
+ *
+ * SDMA interrupts are grouped by type:
+ *
+ *      0 -  N-1 = SDma
+ *      N - 2N-1 = SDmaProgress
+ *     2N - 3N-1 = SDmaIdle
+ */
+static void is_sdma_eng_int(struct hfi1_devdata *dd, unsigned int source)
+{
+       /* what interrupt */
+       unsigned int what  = source / TXE_NUM_SDMA_ENGINES;
+       /* which engine */
+       unsigned int which = source % TXE_NUM_SDMA_ENGINES;
+
+#ifdef CONFIG_SDMA_VERBOSITY
+       dd_dev_err(dd, "CONFIG SDMA(%u) %s:%d %s()\n", which,
+                  slashstrip(__FILE__), __LINE__, __func__);
+       sdma_dumpstate(&dd->per_sdma[which]);
+#endif
+
+       if (likely(what < 3 && which < dd->num_sdma)) {
+               sdma_engine_interrupt(&dd->per_sdma[which], 1ull << source);
+       } else {
+               /* should not happen */
+               dd_dev_err(dd, "Invalid SDMA interrupt 0x%x\n", source);
+       }
+}
+
+/*
+ * RX block receive available interrupt.  Source is < 160.
+ */
+static void is_rcv_avail_int(struct hfi1_devdata *dd, unsigned int source)
+{
+       struct hfi1_ctxtdata *rcd;
+       char *err_detail;
+
+       if (likely(source < dd->num_rcv_contexts)) {
+               rcd = dd->rcd[source];
+               if (rcd) {
+                       if (source < dd->first_user_ctxt)
+                               rcd->do_interrupt(rcd, 0);
+                       else
+                               handle_user_interrupt(rcd);
+                       return; /* OK */
+               }
+               /* received an interrupt, but no rcd */
+               err_detail = "dataless";
+       } else {
+               /* received an interrupt, but are not using that context */
+               err_detail = "out of range";
+       }
+       dd_dev_err(dd, "unexpected %s receive available context interrupt %u\n",
+               err_detail, source);
+}
+
+/*
+ * RX block receive urgent interrupt.  Source is < 160.
+ */
+static void is_rcv_urgent_int(struct hfi1_devdata *dd, unsigned int source)
+{
+       struct hfi1_ctxtdata *rcd;
+       char *err_detail;
+
+       if (likely(source < dd->num_rcv_contexts)) {
+               rcd = dd->rcd[source];
+               if (rcd) {
+                       /* only pay attention to user urgent interrupts */
+                       if (source >= dd->first_user_ctxt)
+                               handle_user_interrupt(rcd);
+                       return; /* OK */
+               }
+               /* received an interrupt, but no rcd */
+               err_detail = "dataless";
+       } else {
+               /* received an interrupt, but are not using that context */
+               err_detail = "out of range";
+       }
+       dd_dev_err(dd, "unexpected %s receive urgent context interrupt %u\n",
+               err_detail, source);
+}
+
+/*
+ * Reserved range interrupt.  Should not be called in normal operation.
+ */
+static void is_reserved_int(struct hfi1_devdata *dd, unsigned int source)
+{
+       char name[64];
+
+       dd_dev_err(dd, "unexpected %s interrupt\n",
+                               is_reserved_name(name, sizeof(name), source));
+}
+
+static const struct is_table is_table[] = {
+/* start                    end
+                               name func               interrupt func */
+{ IS_GENERAL_ERR_START,  IS_GENERAL_ERR_END,
+                               is_misc_err_name,       is_misc_err_int },
+{ IS_SDMAENG_ERR_START,  IS_SDMAENG_ERR_END,
+                               is_sdma_eng_err_name,   is_sdma_eng_err_int },
+{ IS_SENDCTXT_ERR_START, IS_SENDCTXT_ERR_END,
+                               is_sendctxt_err_name,   is_sendctxt_err_int },
+{ IS_SDMA_START,            IS_SDMA_END,
+                               is_sdma_eng_name,       is_sdma_eng_int },
+{ IS_VARIOUS_START,         IS_VARIOUS_END,
+                               is_various_name,        is_various_int },
+{ IS_DC_START,      IS_DC_END,
+                               is_dc_name,             is_dc_int },
+{ IS_RCVAVAIL_START,     IS_RCVAVAIL_END,
+                               is_rcv_avail_name,      is_rcv_avail_int },
+{ IS_RCVURGENT_START,    IS_RCVURGENT_END,
+                               is_rcv_urgent_name,     is_rcv_urgent_int },
+{ IS_SENDCREDIT_START,   IS_SENDCREDIT_END,
+                               is_send_credit_name,    is_send_credit_int},
+{ IS_RESERVED_START,     IS_RESERVED_END,
+                               is_reserved_name,       is_reserved_int},
+};
+
+/*
+ * Interrupt source interrupt - called when the given source has an interrupt.
+ * Source is a bit index into an array of 64-bit integers.
+ */
+static void is_interrupt(struct hfi1_devdata *dd, unsigned int source)
+{
+       const struct is_table *entry;
+
+       /* avoids a double compare by walking the table in-order */
+       for (entry = &is_table[0]; entry->is_name; entry++) {
+               if (source < entry->end) {
+                       trace_hfi1_interrupt(dd, entry, source);
+                       entry->is_int(dd, source - entry->start);
+                       return;
+               }
+       }
+       /* fell off the end */
+       dd_dev_err(dd, "invalid interrupt source %u\n", source);
+}
+
+/*
+ * General interrupt handler.  This is able to correctly handle
+ * all interrupts in case INTx is used.
+ */
+static irqreturn_t general_interrupt(int irq, void *data)
+{
+       struct hfi1_devdata *dd = data;
+       u64 regs[CCE_NUM_INT_CSRS];
+       u32 bit;
+       int i;
+
+       this_cpu_inc(*dd->int_counter);
+
+       /* phase 1: scan and clear all handled interrupts */
+       for (i = 0; i < CCE_NUM_INT_CSRS; i++) {
+               if (dd->gi_mask[i] == 0) {
+                       regs[i] = 0;    /* used later */
+                       continue;
+               }
+               regs[i] = read_csr(dd, CCE_INT_STATUS + (8 * i)) &
+                               dd->gi_mask[i];
+               /* only clear if anything is set */
+               if (regs[i])
+                       write_csr(dd, CCE_INT_CLEAR + (8 * i), regs[i]);
+       }
+
+       /* phase 2: call the appropriate handler */
+       for_each_set_bit(bit, (unsigned long *)&regs[0],
+                                               CCE_NUM_INT_CSRS*64) {
+               is_interrupt(dd, bit);
+       }
+
+       return IRQ_HANDLED;
+}
+
+static irqreturn_t sdma_interrupt(int irq, void *data)
+{
+       struct sdma_engine *sde = data;
+       struct hfi1_devdata *dd = sde->dd;
+       u64 status;
+
+#ifdef CONFIG_SDMA_VERBOSITY
+       dd_dev_err(dd, "CONFIG SDMA(%u) %s:%d %s()\n", sde->this_idx,
+                  slashstrip(__FILE__), __LINE__, __func__);
+       sdma_dumpstate(sde);
+#endif
+
+       this_cpu_inc(*dd->int_counter);
+
+       /* This read_csr is really bad in the hot path */
+       status = read_csr(dd,
+                       CCE_INT_STATUS + (8*(IS_SDMA_START/64)))
+                       & sde->imask;
+       if (likely(status)) {
+               /* clear the interrupt(s) */
+               write_csr(dd,
+                       CCE_INT_CLEAR + (8*(IS_SDMA_START/64)),
+                       status);
+
+               /* handle the interrupt(s) */
+               sdma_engine_interrupt(sde, status);
+       } else
+               dd_dev_err(dd, "SDMA engine %u interrupt, but no status bits set\n",
+                       sde->this_idx);
+
+       return IRQ_HANDLED;
+}
+
+/*
+ * Clear the receive interrupt, forcing the write and making sure
+ * we have data from the chip, pushing everything in front of it
+ * back to the host.
+ */
+static inline void clear_recv_intr(struct hfi1_ctxtdata *rcd)
+{
+       struct hfi1_devdata *dd = rcd->dd;
+       u32 addr = CCE_INT_CLEAR + (8 * rcd->ireg);
+
+       mmiowb();       /* make sure everything before is written */
+       write_csr(dd, addr, rcd->imask);
+       /* force the above write on the chip and get a value back */
+       (void)read_csr(dd, addr);
+}
+
+/* force the receive interrupt */
+static inline void force_recv_intr(struct hfi1_ctxtdata *rcd)
+{
+       write_csr(rcd->dd, CCE_INT_FORCE + (8 * rcd->ireg), rcd->imask);
+}
+
+/* return non-zero if a packet is present */
+static inline int check_packet_present(struct hfi1_ctxtdata *rcd)
+{
+       if (!HFI1_CAP_IS_KSET(DMA_RTAIL))
+               return (rcd->seq_cnt ==
+                               rhf_rcv_seq(rhf_to_cpu(get_rhf_addr(rcd))));
+
+       /* else is RDMA rtail */
+       return (rcd->head != get_rcvhdrtail(rcd));
+}
+
+/*
+ * Receive packet IRQ handler.  This routine expects to be on its own IRQ.
+ * This routine will try to handle packets immediately (latency), but if
+ * it finds too many, it will invoke the thread handler (bandwitdh).  The
+ * chip receive interupt is *not* cleared down until this or the thread (if
+ * invoked) is finished.  The intent is to avoid extra interrupts while we
+ * are processing packets anyway.
+ */
+static irqreturn_t receive_context_interrupt(int irq, void *data)
+{
+       struct hfi1_ctxtdata *rcd = data;
+       struct hfi1_devdata *dd = rcd->dd;
+       int disposition;
+       int present;
+
+       trace_hfi1_receive_interrupt(dd, rcd->ctxt);
+       this_cpu_inc(*dd->int_counter);
+
+       /* receive interrupt remains blocked while processing packets */
+       disposition = rcd->do_interrupt(rcd, 0);
+
+       /*
+        * Too many packets were seen while processing packets in this
+        * IRQ handler.  Invoke the handler thread.  The receive interrupt
+        * remains blocked.
+        */
+       if (disposition == RCV_PKT_LIMIT)
+               return IRQ_WAKE_THREAD;
+
+       /*
+        * The packet processor detected no more packets.  Clear the receive
+        * interrupt and recheck for a packet packet that may have arrived
+        * after the previous check and interrupt clear.  If a packet arrived,
+        * force another interrupt.
+        */
+       clear_recv_intr(rcd);
+       present = check_packet_present(rcd);
+       if (present)
+               force_recv_intr(rcd);
+
+       return IRQ_HANDLED;
+}
+
+/*
+ * Receive packet thread handler.  This expects to be invoked with the
+ * receive interrupt still blocked.
+ */
+static irqreturn_t receive_context_thread(int irq, void *data)
+{
+       struct hfi1_ctxtdata *rcd = data;
+       int present;
+
+       /* receive interrupt is still blocked from the IRQ handler */
+       (void)rcd->do_interrupt(rcd, 1);
+
+       /*
+        * The packet processor will only return if it detected no more
+        * packets.  Hold IRQs here so we can safely clear the interrupt and
+        * recheck for a packet that may have arrived after the previous
+        * check and the interrupt clear.  If a packet arrived, force another
+        * interrupt.
+        */
+       local_irq_disable();
+       clear_recv_intr(rcd);
+       present = check_packet_present(rcd);
+       if (present)
+               force_recv_intr(rcd);
+       local_irq_enable();
+
+       return IRQ_HANDLED;
+}
+
+/* ========================================================================= */
+
+u32 read_physical_state(struct hfi1_devdata *dd)
+{
+       u64 reg;
+
+       reg = read_csr(dd, DC_DC8051_STS_CUR_STATE);
+       return (reg >> DC_DC8051_STS_CUR_STATE_PORT_SHIFT)
+                               & DC_DC8051_STS_CUR_STATE_PORT_MASK;
+}
+
+static u32 read_logical_state(struct hfi1_devdata *dd)
+{
+       u64 reg;
+
+       reg = read_csr(dd, DCC_CFG_PORT_CONFIG);
+       return (reg >> DCC_CFG_PORT_CONFIG_LINK_STATE_SHIFT)
+                               & DCC_CFG_PORT_CONFIG_LINK_STATE_MASK;
+}
+
+static void set_logical_state(struct hfi1_devdata *dd, u32 chip_lstate)
+{
+       u64 reg;
+
+       reg = read_csr(dd, DCC_CFG_PORT_CONFIG);
+       /* clear current state, set new state */
+       reg &= ~DCC_CFG_PORT_CONFIG_LINK_STATE_SMASK;
+       reg |= (u64)chip_lstate << DCC_CFG_PORT_CONFIG_LINK_STATE_SHIFT;
+       write_csr(dd, DCC_CFG_PORT_CONFIG, reg);
+}
+
+/*
+ * Use the 8051 to read a LCB CSR.
+ */
+static int read_lcb_via_8051(struct hfi1_devdata *dd, u32 addr, u64 *data)
+{
+       u32 regno;
+       int ret;
+
+       if (dd->icode == ICODE_FUNCTIONAL_SIMULATOR) {
+               if (acquire_lcb_access(dd, 0) == 0) {
+                       *data = read_csr(dd, addr);
+                       release_lcb_access(dd, 0);
+                       return 0;
+               }
+               return -EBUSY;
+       }
+
+       /* register is an index of LCB registers: (offset - base) / 8 */
+       regno = (addr - DC_LCB_CFG_RUN) >> 3;
+       ret = do_8051_command(dd, HCMD_READ_LCB_CSR, regno, data);
+       if (ret != HCMD_SUCCESS)
+               return -EBUSY;
+       return 0;
+}
+
+/*
+ * Read an LCB CSR.  Access may not be in host control, so check.
+ * Return 0 on success, -EBUSY on failure.
+ */
+int read_lcb_csr(struct hfi1_devdata *dd, u32 addr, u64 *data)
+{
+       struct hfi1_pportdata *ppd = dd->pport;
+
+       /* if up, go through the 8051 for the value */
+       if (ppd->host_link_state & HLS_UP)
+               return read_lcb_via_8051(dd, addr, data);
+       /* if going up or down, no access */
+       if (ppd->host_link_state & (HLS_GOING_UP | HLS_GOING_OFFLINE))
+               return -EBUSY;
+       /* otherwise, host has access */
+       *data = read_csr(dd, addr);
+       return 0;
+}
+
+/*
+ * Use the 8051 to write a LCB CSR.
+ */
+static int write_lcb_via_8051(struct hfi1_devdata *dd, u32 addr, u64 data)
+{
+
+       if (acquire_lcb_access(dd, 0) == 0) {
+               write_csr(dd, addr, data);
+               release_lcb_access(dd, 0);
+               return 0;
+       }
+       return -EBUSY;
+}
+
+/*
+ * Write an LCB CSR.  Access may not be in host control, so check.
+ * Return 0 on success, -EBUSY on failure.
+ */
+int write_lcb_csr(struct hfi1_devdata *dd, u32 addr, u64 data)
+{
+       struct hfi1_pportdata *ppd = dd->pport;
+
+       /* if up, go through the 8051 for the value */
+       if (ppd->host_link_state & HLS_UP)
+               return write_lcb_via_8051(dd, addr, data);
+       /* if going up or down, no access */
+       if (ppd->host_link_state & (HLS_GOING_UP | HLS_GOING_OFFLINE))
+               return -EBUSY;
+       /* otherwise, host has access */
+       write_csr(dd, addr, data);
+       return 0;
+}
+
+/*
+ * Returns:
+ *     < 0 = Linux error, not able to get access
+ *     > 0 = 8051 command RETURN_CODE
+ */
+static int do_8051_command(
+       struct hfi1_devdata *dd,
+       u32 type,
+       u64 in_data,
+       u64 *out_data)
+{
+       u64 reg, completed;
+       int return_code;
+       unsigned long flags;
+       unsigned long timeout;
+
+       hfi1_cdbg(DC8051, "type %d, data 0x%012llx", type, in_data);
+
+       /*
+        * Alternative to holding the lock for a long time:
+        * - keep busy wait - have other users bounce off
+        */
+       spin_lock_irqsave(&dd->dc8051_lock, flags);
+
+       /* We can't send any commands to the 8051 if it's in reset */
+       if (dd->dc_shutdown) {
+               return_code = -ENODEV;
+               goto fail;
+       }
+
+       /*
+        * If an 8051 host command timed out previously, then the 8051 is
+        * stuck.
+        *
+        * On first timeout, attempt to reset and restart the entire DC
+        * block (including 8051). (Is this too big of a hammer?)
+        *
+        * If the 8051 times out a second time, the reset did not bring it
+        * back to healthy life. In that case, fail any subsequent commands.
+        */
+       if (dd->dc8051_timed_out) {
+               if (dd->dc8051_timed_out > 1) {
+                       dd_dev_err(dd,
+                                  "Previous 8051 host command timed out, skipping command %u\n",
+                                  type);
+                       return_code = -ENXIO;
+                       goto fail;
+               }
+               spin_unlock_irqrestore(&dd->dc8051_lock, flags);
+               dc_shutdown(dd);
+               dc_start(dd);
+               spin_lock_irqsave(&dd->dc8051_lock, flags);
+       }
+
+       /*
+        * If there is no timeout, then the 8051 command interface is
+        * waiting for a command.
+        */
+
+       /*
+        * Do two writes: the first to stabilize the type and req_data, the
+        * second to activate.
+        */
+       reg = ((u64)type & DC_DC8051_CFG_HOST_CMD_0_REQ_TYPE_MASK)
+                       << DC_DC8051_CFG_HOST_CMD_0_REQ_TYPE_SHIFT
+               | (in_data & DC_DC8051_CFG_HOST_CMD_0_REQ_DATA_MASK)
+                       << DC_DC8051_CFG_HOST_CMD_0_REQ_DATA_SHIFT;
+       write_csr(dd, DC_DC8051_CFG_HOST_CMD_0, reg);
+       reg |= DC_DC8051_CFG_HOST_CMD_0_REQ_NEW_SMASK;
+       write_csr(dd, DC_DC8051_CFG_HOST_CMD_0, reg);
+
+       /* wait for completion, alternate: interrupt */
+       timeout = jiffies + msecs_to_jiffies(DC8051_COMMAND_TIMEOUT);
+       while (1) {
+               reg = read_csr(dd, DC_DC8051_CFG_HOST_CMD_1);
+               completed = reg & DC_DC8051_CFG_HOST_CMD_1_COMPLETED_SMASK;
+               if (completed)
+                       break;
+               if (time_after(jiffies, timeout)) {
+                       dd->dc8051_timed_out++;
+                       dd_dev_err(dd, "8051 host command %u timeout\n", type);
+                       if (out_data)
+                               *out_data = 0;
+                       return_code = -ETIMEDOUT;
+                       goto fail;
+               }
+               udelay(2);
+       }
+
+       if (out_data) {
+               *out_data = (reg >> DC_DC8051_CFG_HOST_CMD_1_RSP_DATA_SHIFT)
+                               & DC_DC8051_CFG_HOST_CMD_1_RSP_DATA_MASK;
+               if (type == HCMD_READ_LCB_CSR) {
+                       /* top 16 bits are in a different register */
+                       *out_data |= (read_csr(dd, DC_DC8051_CFG_EXT_DEV_1)
+                               & DC_DC8051_CFG_EXT_DEV_1_REQ_DATA_SMASK)
+                               << (48
+                                   - DC_DC8051_CFG_EXT_DEV_1_REQ_DATA_SHIFT);
+               }
+       }
+       return_code = (reg >> DC_DC8051_CFG_HOST_CMD_1_RETURN_CODE_SHIFT)
+                               & DC_DC8051_CFG_HOST_CMD_1_RETURN_CODE_MASK;
+       dd->dc8051_timed_out = 0;
+       /*
+        * Clear command for next user.
+        */
+       write_csr(dd, DC_DC8051_CFG_HOST_CMD_0, 0);
+
+fail:
+       spin_unlock_irqrestore(&dd->dc8051_lock, flags);
+
+       return return_code;
+}
+
+static int set_physical_link_state(struct hfi1_devdata *dd, u64 state)
+{
+       return do_8051_command(dd, HCMD_CHANGE_PHY_STATE, state, NULL);
+}
+
+static int load_8051_config(struct hfi1_devdata *dd, u8 field_id,
+                           u8 lane_id, u32 config_data)
+{
+       u64 data;
+       int ret;
+
+       data = (u64)field_id << LOAD_DATA_FIELD_ID_SHIFT
+               | (u64)lane_id << LOAD_DATA_LANE_ID_SHIFT
+               | (u64)config_data << LOAD_DATA_DATA_SHIFT;
+       ret = do_8051_command(dd, HCMD_LOAD_CONFIG_DATA, data, NULL);
+       if (ret != HCMD_SUCCESS) {
+               dd_dev_err(dd,
+                       "load 8051 config: field id %d, lane %d, err %d\n",
+                       (int)field_id, (int)lane_id, ret);
+       }
+       return ret;
+}
+
+/*
+ * Read the 8051 firmware "registers".  Use the RAM directly.  Always
+ * set the result, even on error.
+ * Return 0 on success, -errno on failure
+ */
+static int read_8051_config(struct hfi1_devdata *dd, u8 field_id, u8 lane_id,
+                           u32 *result)
+{
+       u64 big_data;
+       u32 addr;
+       int ret;
+
+       /* address start depends on the lane_id */
+       if (lane_id < 4)
+               addr = (4 * NUM_GENERAL_FIELDS)
+                       + (lane_id * 4 * NUM_LANE_FIELDS);
+       else
+               addr = 0;
+       addr += field_id * 4;
+
+       /* read is in 8-byte chunks, hardware will truncate the address down */
+       ret = read_8051_data(dd, addr, 8, &big_data);
+
+       if (ret == 0) {
+               /* extract the 4 bytes we want */
+               if (addr & 0x4)
+                       *result = (u32)(big_data >> 32);
+               else
+                       *result = (u32)big_data;
+       } else {
+               *result = 0;
+               dd_dev_err(dd, "%s: direct read failed, lane %d, field %d!\n",
+                       __func__, lane_id, field_id);
+       }
+
+       return ret;
+}
+
+static int write_vc_local_phy(struct hfi1_devdata *dd, u8 power_management,
+                             u8 continuous)
+{
+       u32 frame;
+
+       frame = continuous << CONTINIOUS_REMOTE_UPDATE_SUPPORT_SHIFT
+               | power_management << POWER_MANAGEMENT_SHIFT;
+       return load_8051_config(dd, VERIFY_CAP_LOCAL_PHY,
+                               GENERAL_CONFIG, frame);
+}
+
+static int write_vc_local_fabric(struct hfi1_devdata *dd, u8 vau, u8 z, u8 vcu,
+                                u16 vl15buf, u8 crc_sizes)
+{
+       u32 frame;
+
+       frame = (u32)vau << VAU_SHIFT
+               | (u32)z << Z_SHIFT
+               | (u32)vcu << VCU_SHIFT
+               | (u32)vl15buf << VL15BUF_SHIFT
+               | (u32)crc_sizes << CRC_SIZES_SHIFT;
+       return load_8051_config(dd, VERIFY_CAP_LOCAL_FABRIC,
+                               GENERAL_CONFIG, frame);
+}
+
+static void read_vc_local_link_width(struct hfi1_devdata *dd, u8 *misc_bits,
+                                    u8 *flag_bits, u16 *link_widths)
+{
+       u32 frame;
+
+       read_8051_config(dd, VERIFY_CAP_LOCAL_LINK_WIDTH, GENERAL_CONFIG,
+                               &frame);
+       *misc_bits = (frame >> MISC_CONFIG_BITS_SHIFT) & MISC_CONFIG_BITS_MASK;
+       *flag_bits = (frame >> LOCAL_FLAG_BITS_SHIFT) & LOCAL_FLAG_BITS_MASK;
+       *link_widths = (frame >> LINK_WIDTH_SHIFT) & LINK_WIDTH_MASK;
+}
+
+static int write_vc_local_link_width(struct hfi1_devdata *dd,
+                                    u8 misc_bits,
+                                    u8 flag_bits,
+                                    u16 link_widths)
+{
+       u32 frame;
+
+       frame = (u32)misc_bits << MISC_CONFIG_BITS_SHIFT
+               | (u32)flag_bits << LOCAL_FLAG_BITS_SHIFT
+               | (u32)link_widths << LINK_WIDTH_SHIFT;
+       return load_8051_config(dd, VERIFY_CAP_LOCAL_LINK_WIDTH, GENERAL_CONFIG,
+                    frame);
+}
+
+static int write_local_device_id(struct hfi1_devdata *dd, u16 device_id,
+                                u8 device_rev)
+{
+       u32 frame;
+
+       frame = ((u32)device_id << LOCAL_DEVICE_ID_SHIFT)
+               | ((u32)device_rev << LOCAL_DEVICE_REV_SHIFT);
+       return load_8051_config(dd, LOCAL_DEVICE_ID, GENERAL_CONFIG, frame);
+}
+
+static void read_remote_device_id(struct hfi1_devdata *dd, u16 *device_id,
+                                 u8 *device_rev)
+{
+       u32 frame;
+
+       read_8051_config(dd, REMOTE_DEVICE_ID, GENERAL_CONFIG, &frame);
+       *device_id = (frame >> REMOTE_DEVICE_ID_SHIFT) & REMOTE_DEVICE_ID_MASK;
+       *device_rev = (frame >> REMOTE_DEVICE_REV_SHIFT)
+                       & REMOTE_DEVICE_REV_MASK;
+}
+
+void read_misc_status(struct hfi1_devdata *dd, u8 *ver_a, u8 *ver_b)
+{
+       u32 frame;
+
+       read_8051_config(dd, MISC_STATUS, GENERAL_CONFIG, &frame);
+       *ver_a = (frame >> STS_FM_VERSION_A_SHIFT) & STS_FM_VERSION_A_MASK;
+       *ver_b = (frame >> STS_FM_VERSION_B_SHIFT) & STS_FM_VERSION_B_MASK;
+}
+
+static void read_vc_remote_phy(struct hfi1_devdata *dd, u8 *power_management,
+                              u8 *continuous)
+{
+       u32 frame;
+
+       read_8051_config(dd, VERIFY_CAP_REMOTE_PHY, GENERAL_CONFIG, &frame);
+       *power_management = (frame >> POWER_MANAGEMENT_SHIFT)
+                                       & POWER_MANAGEMENT_MASK;
+       *continuous = (frame >> CONTINIOUS_REMOTE_UPDATE_SUPPORT_SHIFT)
+                                       & CONTINIOUS_REMOTE_UPDATE_SUPPORT_MASK;
+}
+
+static void read_vc_remote_fabric(struct hfi1_devdata *dd, u8 *vau, u8 *z,
+                                 u8 *vcu, u16 *vl15buf, u8 *crc_sizes)
+{
+       u32 frame;
+
+       read_8051_config(dd, VERIFY_CAP_REMOTE_FABRIC, GENERAL_CONFIG, &frame);
+       *vau = (frame >> VAU_SHIFT) & VAU_MASK;
+       *z = (frame >> Z_SHIFT) & Z_MASK;
+       *vcu = (frame >> VCU_SHIFT) & VCU_MASK;
+       *vl15buf = (frame >> VL15BUF_SHIFT) & VL15BUF_MASK;
+       *crc_sizes = (frame >> CRC_SIZES_SHIFT) & CRC_SIZES_MASK;
+}
+
+static void read_vc_remote_link_width(struct hfi1_devdata *dd,
+                                     u8 *remote_tx_rate,
+                                     u16 *link_widths)
+{
+       u32 frame;
+
+       read_8051_config(dd, VERIFY_CAP_REMOTE_LINK_WIDTH, GENERAL_CONFIG,
+                               &frame);
+       *remote_tx_rate = (frame >> REMOTE_TX_RATE_SHIFT)
+                               & REMOTE_TX_RATE_MASK;
+       *link_widths = (frame >> LINK_WIDTH_SHIFT) & LINK_WIDTH_MASK;
+}
+
+static void read_local_lni(struct hfi1_devdata *dd, u8 *enable_lane_rx)
+{
+       u32 frame;
+
+       read_8051_config(dd, LOCAL_LNI_INFO, GENERAL_CONFIG, &frame);
+       *enable_lane_rx = (frame >> ENABLE_LANE_RX_SHIFT) & ENABLE_LANE_RX_MASK;
+}
+
+static void read_mgmt_allowed(struct hfi1_devdata *dd, u8 *mgmt_allowed)
+{
+       u32 frame;
+
+       read_8051_config(dd, REMOTE_LNI_INFO, GENERAL_CONFIG, &frame);
+       *mgmt_allowed = (frame >> MGMT_ALLOWED_SHIFT) & MGMT_ALLOWED_MASK;
+}
+
+static void read_last_local_state(struct hfi1_devdata *dd, u32 *lls)
+{
+       read_8051_config(dd, LAST_LOCAL_STATE_COMPLETE, GENERAL_CONFIG, lls);
+}
+
+static void read_last_remote_state(struct hfi1_devdata *dd, u32 *lrs)
+{
+       read_8051_config(dd, LAST_REMOTE_STATE_COMPLETE, GENERAL_CONFIG, lrs);
+}
+
+void hfi1_read_link_quality(struct hfi1_devdata *dd, u8 *link_quality)
+{
+       u32 frame;
+       int ret;
+
+       *link_quality = 0;
+       if (dd->pport->host_link_state & HLS_UP) {
+               ret = read_8051_config(dd, LINK_QUALITY_INFO, GENERAL_CONFIG,
+                                       &frame);
+               if (ret == 0)
+                       *link_quality = (frame >> LINK_QUALITY_SHIFT)
+                                               & LINK_QUALITY_MASK;
+       }
+}
+
+static void read_planned_down_reason_code(struct hfi1_devdata *dd, u8 *pdrrc)
+{
+       u32 frame;
+
+       read_8051_config(dd, LINK_QUALITY_INFO, GENERAL_CONFIG, &frame);
+       *pdrrc = (frame >> DOWN_REMOTE_REASON_SHIFT) & DOWN_REMOTE_REASON_MASK;
+}
+
+static int read_tx_settings(struct hfi1_devdata *dd,
+                           u8 *enable_lane_tx,
+                           u8 *tx_polarity_inversion,
+                           u8 *rx_polarity_inversion,
+                           u8 *max_rate)
+{
+       u32 frame;
+       int ret;
+
+       ret = read_8051_config(dd, TX_SETTINGS, GENERAL_CONFIG, &frame);
+       *enable_lane_tx = (frame >> ENABLE_LANE_TX_SHIFT)
+                               & ENABLE_LANE_TX_MASK;
+       *tx_polarity_inversion = (frame >> TX_POLARITY_INVERSION_SHIFT)
+                               & TX_POLARITY_INVERSION_MASK;
+       *rx_polarity_inversion = (frame >> RX_POLARITY_INVERSION_SHIFT)
+                               & RX_POLARITY_INVERSION_MASK;
+       *max_rate = (frame >> MAX_RATE_SHIFT) & MAX_RATE_MASK;
+       return ret;
+}
+
+static int write_tx_settings(struct hfi1_devdata *dd,
+                            u8 enable_lane_tx,
+                            u8 tx_polarity_inversion,
+                            u8 rx_polarity_inversion,
+                            u8 max_rate)
+{
+       u32 frame;
+
+       /* no need to mask, all variable sizes match field widths */
+       frame = enable_lane_tx << ENABLE_LANE_TX_SHIFT
+               | tx_polarity_inversion << TX_POLARITY_INVERSION_SHIFT
+               | rx_polarity_inversion << RX_POLARITY_INVERSION_SHIFT
+               | max_rate << MAX_RATE_SHIFT;
+       return load_8051_config(dd, TX_SETTINGS, GENERAL_CONFIG, frame);
+}
+
+static void check_fabric_firmware_versions(struct hfi1_devdata *dd)
+{
+       u32 frame, version, prod_id;
+       int ret, lane;
+
+       /* 4 lanes */
+       for (lane = 0; lane < 4; lane++) {
+               ret = read_8051_config(dd, SPICO_FW_VERSION, lane, &frame);
+               if (ret) {
+                       dd_dev_err(
+                               dd,
+                               "Unable to read lane %d firmware details\n",
+                               lane);
+                       continue;
+               }
+               version = (frame >> SPICO_ROM_VERSION_SHIFT)
+                                       & SPICO_ROM_VERSION_MASK;
+               prod_id = (frame >> SPICO_ROM_PROD_ID_SHIFT)
+                                       & SPICO_ROM_PROD_ID_MASK;
+               dd_dev_info(dd,
+                       "Lane %d firmware: version 0x%04x, prod_id 0x%04x\n",
+                       lane, version, prod_id);
+       }
+}
+
+/*
+ * Read an idle LCB message.
+ *
+ * Returns 0 on success, -EINVAL on error
+ */
+static int read_idle_message(struct hfi1_devdata *dd, u64 type, u64 *data_out)
+{
+       int ret;
+
+       ret = do_8051_command(dd, HCMD_READ_LCB_IDLE_MSG,
+               type, data_out);
+       if (ret != HCMD_SUCCESS) {
+               dd_dev_err(dd, "read idle message: type %d, err %d\n",
+                       (u32)type, ret);
+               return -EINVAL;
+       }
+       dd_dev_info(dd, "%s: read idle message 0x%llx\n", __func__, *data_out);
+       /* return only the payload as we already know the type */
+       *data_out >>= IDLE_PAYLOAD_SHIFT;
+       return 0;
+}
+
+/*
+ * Read an idle SMA message.  To be done in response to a notification from
+ * the 8051.
+ *
+ * Returns 0 on success, -EINVAL on error
+ */
+static int read_idle_sma(struct hfi1_devdata *dd, u64 *data)
+{
+       return read_idle_message(dd,
+                       (u64)IDLE_SMA << IDLE_MSG_TYPE_SHIFT, data);
+}
+
+/*
+ * Send an idle LCB message.
+ *
+ * Returns 0 on success, -EINVAL on error
+ */
+static int send_idle_message(struct hfi1_devdata *dd, u64 data)
+{
+       int ret;
+
+       dd_dev_info(dd, "%s: sending idle message 0x%llx\n", __func__, data);
+       ret = do_8051_command(dd, HCMD_SEND_LCB_IDLE_MSG, data, NULL);
+       if (ret != HCMD_SUCCESS) {
+               dd_dev_err(dd, "send idle message: data 0x%llx, err %d\n",
+                       data, ret);
+               return -EINVAL;
+       }
+       return 0;
+}
+
+/*
+ * Send an idle SMA message.
+ *
+ * Returns 0 on success, -EINVAL on error
+ */
+int send_idle_sma(struct hfi1_devdata *dd, u64 message)
+{
+       u64 data;
+
+       data = ((message & IDLE_PAYLOAD_MASK) << IDLE_PAYLOAD_SHIFT)
+               | ((u64)IDLE_SMA << IDLE_MSG_TYPE_SHIFT);
+       return send_idle_message(dd, data);
+}
+
+/*
+ * Initialize the LCB then do a quick link up.  This may or may not be
+ * in loopback.
+ *
+ * return 0 on success, -errno on error
+ */
+static int do_quick_linkup(struct hfi1_devdata *dd)
+{
+       u64 reg;
+       unsigned long timeout;
+       int ret;
+
+       lcb_shutdown(dd, 0);
+
+       if (loopback) {
+               /* LCB_CFG_LOOPBACK.VAL = 2 */
+               /* LCB_CFG_LANE_WIDTH.VAL = 0 */
+               write_csr(dd, DC_LCB_CFG_LOOPBACK,
+                       IB_PACKET_TYPE << DC_LCB_CFG_LOOPBACK_VAL_SHIFT);
+               write_csr(dd, DC_LCB_CFG_LANE_WIDTH, 0);
+       }
+
+       /* start the LCBs */
+       /* LCB_CFG_TX_FIFOS_RESET.VAL = 0 */
+       write_csr(dd, DC_LCB_CFG_TX_FIFOS_RESET, 0);
+
+       /* simulator only loopback steps */
+       if (loopback && dd->icode == ICODE_FUNCTIONAL_SIMULATOR) {
+               /* LCB_CFG_RUN.EN = 1 */
+               write_csr(dd, DC_LCB_CFG_RUN,
+                       1ull << DC_LCB_CFG_RUN_EN_SHIFT);
+
+               /* watch LCB_STS_LINK_TRANSFER_ACTIVE */
+               timeout = jiffies + msecs_to_jiffies(10);
+               while (1) {
+                       reg = read_csr(dd,
+                               DC_LCB_STS_LINK_TRANSFER_ACTIVE);
+                       if (reg)
+                               break;
+                       if (time_after(jiffies, timeout)) {
+                               dd_dev_err(dd,
+                                       "timeout waiting for LINK_TRANSFER_ACTIVE\n");
+                               return -ETIMEDOUT;
+                       }
+                       udelay(2);
+               }
+
+               write_csr(dd, DC_LCB_CFG_ALLOW_LINK_UP,
+                       1ull << DC_LCB_CFG_ALLOW_LINK_UP_VAL_SHIFT);
+       }
+
+       if (!loopback) {
+               /*
+                * When doing quick linkup and not in loopback, both
+                * sides must be done with LCB set-up before either
+                * starts the quick linkup.  Put a delay here so that
+                * both sides can be started and have a chance to be
+                * done with LCB set up before resuming.
+                */
+               dd_dev_err(dd,
+                       "Pausing for peer to be finished with LCB set up\n");
+               msleep(5000);
+               dd_dev_err(dd,
+                       "Continuing with quick linkup\n");
+       }
+
+       write_csr(dd, DC_LCB_ERR_EN, 0); /* mask LCB errors */
+       set_8051_lcb_access(dd);
+
+       /*
+        * State "quick" LinkUp request sets the physical link state to
+        * LinkUp without a verify capability sequence.
+        * This state is in simulator v37 and later.
+        */
+       ret = set_physical_link_state(dd, PLS_QUICK_LINKUP);
+       if (ret != HCMD_SUCCESS) {
+               dd_dev_err(dd,
+                       "%s: set physical link state to quick LinkUp failed with return %d\n",
+                       __func__, ret);
+
+               set_host_lcb_access(dd);
+               write_csr(dd, DC_LCB_ERR_EN, ~0ull); /* watch LCB errors */
+
+               if (ret >= 0)
+                       ret = -EINVAL;
+               return ret;
+       }
+
+       return 0; /* success */
+}
+
+/*
+ * Set the SerDes to internal loopback mode.
+ * Returns 0 on success, -errno on error.
+ */
+static int set_serdes_loopback_mode(struct hfi1_devdata *dd)
+{
+       int ret;
+
+       ret = set_physical_link_state(dd, PLS_INTERNAL_SERDES_LOOPBACK);
+       if (ret == HCMD_SUCCESS)
+               return 0;
+       dd_dev_err(dd,
+               "Set physical link state to SerDes Loopback failed with return %d\n",
+               ret);
+       if (ret >= 0)
+               ret = -EINVAL;
+       return ret;
+}
+
+/*
+ * Do all special steps to set up loopback.
+ */
+static int init_loopback(struct hfi1_devdata *dd)
+{
+       dd_dev_info(dd, "Entering loopback mode\n");
+
+       /* all loopbacks should disable self GUID check */
+       write_csr(dd, DC_DC8051_CFG_MODE,
+               (read_csr(dd, DC_DC8051_CFG_MODE) | DISABLE_SELF_GUID_CHECK));
+
+       /*
+        * The simulator has only one loopback option - LCB.  Switch
+        * to that option, which includes quick link up.
+        *
+        * Accept all valid loopback values.
+        */
+       if ((dd->icode == ICODE_FUNCTIONAL_SIMULATOR)
+               && (loopback == LOOPBACK_SERDES
+                       || loopback == LOOPBACK_LCB
+                       || loopback == LOOPBACK_CABLE)) {
+               loopback = LOOPBACK_LCB;
+               quick_linkup = 1;
+               return 0;
+       }
+
+       /* handle serdes loopback */
+       if (loopback == LOOPBACK_SERDES) {
+               /* internal serdes loopack needs quick linkup on RTL */
+               if (dd->icode == ICODE_RTL_SILICON)
+                       quick_linkup = 1;
+               return set_serdes_loopback_mode(dd);
+       }
+
+       /* LCB loopback - handled at poll time */
+       if (loopback == LOOPBACK_LCB) {
+               quick_linkup = 1; /* LCB is always quick linkup */
+
+               /* not supported in emulation due to emulation RTL changes */
+               if (dd->icode == ICODE_FPGA_EMULATION) {
+                       dd_dev_err(dd,
+                               "LCB loopback not supported in emulation\n");
+                       return -EINVAL;
+               }
+               return 0;
+       }
+
+       /* external cable loopback requires no extra steps */
+       if (loopback == LOOPBACK_CABLE)
+               return 0;
+
+       dd_dev_err(dd, "Invalid loopback mode %d\n", loopback);
+       return -EINVAL;
+}
+
+/*
+ * Translate from the OPA_LINK_WIDTH handed to us by the FM to bits
+ * used in the Verify Capability link width attribute.
+ */
+static u16 opa_to_vc_link_widths(u16 opa_widths)
+{
+       int i;
+       u16 result = 0;
+
+       static const struct link_bits {
+               u16 from;
+               u16 to;
+       } opa_link_xlate[] = {
+               { OPA_LINK_WIDTH_1X, 1 << (1-1)  },
+               { OPA_LINK_WIDTH_2X, 1 << (2-1)  },
+               { OPA_LINK_WIDTH_3X, 1 << (3-1)  },
+               { OPA_LINK_WIDTH_4X, 1 << (4-1)  },
+       };
+
+       for (i = 0; i < ARRAY_SIZE(opa_link_xlate); i++) {
+               if (opa_widths & opa_link_xlate[i].from)
+                       result |= opa_link_xlate[i].to;
+       }
+       return result;
+}
+
+/*
+ * Set link attributes before moving to polling.
+ */
+static int set_local_link_attributes(struct hfi1_pportdata *ppd)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+       u8 enable_lane_tx;
+       u8 tx_polarity_inversion;
+       u8 rx_polarity_inversion;
+       int ret;
+
+       /* reset our fabric serdes to clear any lingering problems */
+       fabric_serdes_reset(dd);
+
+       /* set the local tx rate - need to read-modify-write */
+       ret = read_tx_settings(dd, &enable_lane_tx, &tx_polarity_inversion,
+               &rx_polarity_inversion, &ppd->local_tx_rate);
+       if (ret)
+               goto set_local_link_attributes_fail;
+
+       if (dd->dc8051_ver < dc8051_ver(0, 20)) {
+               /* set the tx rate to the fastest enabled */
+               if (ppd->link_speed_enabled & OPA_LINK_SPEED_25G)
+                       ppd->local_tx_rate = 1;
+               else
+                       ppd->local_tx_rate = 0;
+       } else {
+               /* set the tx rate to all enabled */
+               ppd->local_tx_rate = 0;
+               if (ppd->link_speed_enabled & OPA_LINK_SPEED_25G)
+                       ppd->local_tx_rate |= 2;
+               if (ppd->link_speed_enabled & OPA_LINK_SPEED_12_5G)
+                       ppd->local_tx_rate |= 1;
+       }
+
+       enable_lane_tx = 0xF; /* enable all four lanes */
+       ret = write_tx_settings(dd, enable_lane_tx, tx_polarity_inversion,
+                    rx_polarity_inversion, ppd->local_tx_rate);
+       if (ret != HCMD_SUCCESS)
+               goto set_local_link_attributes_fail;
+
+       /*
+        * DC supports continuous updates.
+        */
+       ret = write_vc_local_phy(dd, 0 /* no power management */,
+                                    1 /* continuous updates */);
+       if (ret != HCMD_SUCCESS)
+               goto set_local_link_attributes_fail;
+
+       /* z=1 in the next call: AU of 0 is not supported by the hardware */
+       ret = write_vc_local_fabric(dd, dd->vau, 1, dd->vcu, dd->vl15_init,
+                                   ppd->port_crc_mode_enabled);
+       if (ret != HCMD_SUCCESS)
+               goto set_local_link_attributes_fail;
+
+       ret = write_vc_local_link_width(dd, 0, 0,
+                    opa_to_vc_link_widths(ppd->link_width_enabled));
+       if (ret != HCMD_SUCCESS)
+               goto set_local_link_attributes_fail;
+
+       /* let peer know who we are */
+       ret = write_local_device_id(dd, dd->pcidev->device, dd->minrev);
+       if (ret == HCMD_SUCCESS)
+               return 0;
+
+set_local_link_attributes_fail:
+       dd_dev_err(dd,
+               "Failed to set local link attributes, return 0x%x\n",
+               ret);
+       return ret;
+}
+
+/*
+ * Call this to start the link.  Schedule a retry if the cable is not
+ * present or if unable to start polling.  Do not do anything if the
+ * link is disabled.  Returns 0 if link is disabled or moved to polling
+ */
+int start_link(struct hfi1_pportdata *ppd)
+{
+       if (!ppd->link_enabled) {
+               dd_dev_info(ppd->dd,
+                       "%s: stopping link start because link is disabled\n",
+                       __func__);
+               return 0;
+       }
+       if (!ppd->driver_link_ready) {
+               dd_dev_info(ppd->dd,
+                       "%s: stopping link start because driver is not ready\n",
+                       __func__);
+               return 0;
+       }
+
+       if (qsfp_mod_present(ppd) || loopback == LOOPBACK_SERDES ||
+                       loopback == LOOPBACK_LCB ||
+                       ppd->dd->icode == ICODE_FUNCTIONAL_SIMULATOR)
+               return set_link_state(ppd, HLS_DN_POLL);
+
+       dd_dev_info(ppd->dd,
+               "%s: stopping link start because no cable is present\n",
+               __func__);
+       return -EAGAIN;
+}
+
+static void reset_qsfp(struct hfi1_pportdata *ppd)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+       u64 mask, qsfp_mask;
+
+       mask = (u64)QSFP_HFI0_RESET_N;
+       qsfp_mask = read_csr(dd,
+               dd->hfi1_id ? ASIC_QSFP2_OE : ASIC_QSFP1_OE);
+       qsfp_mask |= mask;
+       write_csr(dd,
+               dd->hfi1_id ? ASIC_QSFP2_OE : ASIC_QSFP1_OE,
+               qsfp_mask);
+
+       qsfp_mask = read_csr(dd,
+               dd->hfi1_id ? ASIC_QSFP2_OUT : ASIC_QSFP1_OUT);
+       qsfp_mask &= ~mask;
+       write_csr(dd,
+               dd->hfi1_id ? ASIC_QSFP2_OUT : ASIC_QSFP1_OUT,
+               qsfp_mask);
+
+       udelay(10);
+
+       qsfp_mask |= mask;
+       write_csr(dd,
+               dd->hfi1_id ? ASIC_QSFP2_OUT : ASIC_QSFP1_OUT,
+               qsfp_mask);
+}
+
+static int handle_qsfp_error_conditions(struct hfi1_pportdata *ppd,
+                                       u8 *qsfp_interrupt_status)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+
+       if ((qsfp_interrupt_status[0] & QSFP_HIGH_TEMP_ALARM) ||
+               (qsfp_interrupt_status[0] & QSFP_HIGH_TEMP_WARNING))
+               dd_dev_info(dd,
+                       "%s: QSFP cable on fire\n",
+                       __func__);
+
+       if ((qsfp_interrupt_status[0] & QSFP_LOW_TEMP_ALARM) ||
+               (qsfp_interrupt_status[0] & QSFP_LOW_TEMP_WARNING))
+               dd_dev_info(dd,
+                       "%s: QSFP cable temperature too low\n",
+                       __func__);
+
+       if ((qsfp_interrupt_status[1] & QSFP_HIGH_VCC_ALARM) ||
+               (qsfp_interrupt_status[1] & QSFP_HIGH_VCC_WARNING))
+               dd_dev_info(dd,
+                       "%s: QSFP supply voltage too high\n",
+                       __func__);
+
+       if ((qsfp_interrupt_status[1] & QSFP_LOW_VCC_ALARM) ||
+               (qsfp_interrupt_status[1] & QSFP_LOW_VCC_WARNING))
+               dd_dev_info(dd,
+                       "%s: QSFP supply voltage too low\n",
+                       __func__);
+
+       /* Byte 2 is vendor specific */
+
+       if ((qsfp_interrupt_status[3] & QSFP_HIGH_POWER_ALARM) ||
+               (qsfp_interrupt_status[3] & QSFP_HIGH_POWER_WARNING))
+               dd_dev_info(dd,
+                       "%s: Cable RX channel 1/2 power too high\n",
+                       __func__);
+
+       if ((qsfp_interrupt_status[3] & QSFP_LOW_POWER_ALARM) ||
+               (qsfp_interrupt_status[3] & QSFP_LOW_POWER_WARNING))
+               dd_dev_info(dd,
+                       "%s: Cable RX channel 1/2 power too low\n",
+                       __func__);
+
+       if ((qsfp_interrupt_status[4] & QSFP_HIGH_POWER_ALARM) ||
+               (qsfp_interrupt_status[4] & QSFP_HIGH_POWER_WARNING))
+               dd_dev_info(dd,
+                       "%s: Cable RX channel 3/4 power too high\n",
+                       __func__);
+
+       if ((qsfp_interrupt_status[4] & QSFP_LOW_POWER_ALARM) ||
+               (qsfp_interrupt_status[4] & QSFP_LOW_POWER_WARNING))
+               dd_dev_info(dd,
+                       "%s: Cable RX channel 3/4 power too low\n",
+                       __func__);
+
+       if ((qsfp_interrupt_status[5] & QSFP_HIGH_BIAS_ALARM) ||
+               (qsfp_interrupt_status[5] & QSFP_HIGH_BIAS_WARNING))
+               dd_dev_info(dd,
+                       "%s: Cable TX channel 1/2 bias too high\n",
+                       __func__);
+
+       if ((qsfp_interrupt_status[5] & QSFP_LOW_BIAS_ALARM) ||
+               (qsfp_interrupt_status[5] & QSFP_LOW_BIAS_WARNING))
+               dd_dev_info(dd,
+                       "%s: Cable TX channel 1/2 bias too low\n",
+                       __func__);
+
+       if ((qsfp_interrupt_status[6] & QSFP_HIGH_BIAS_ALARM) ||
+               (qsfp_interrupt_status[6] & QSFP_HIGH_BIAS_WARNING))
+               dd_dev_info(dd,
+                       "%s: Cable TX channel 3/4 bias too high\n",
+                       __func__);
+
+       if ((qsfp_interrupt_status[6] & QSFP_LOW_BIAS_ALARM) ||
+               (qsfp_interrupt_status[6] & QSFP_LOW_BIAS_WARNING))
+               dd_dev_info(dd,
+                       "%s: Cable TX channel 3/4 bias too low\n",
+                       __func__);
+
+       if ((qsfp_interrupt_status[7] & QSFP_HIGH_POWER_ALARM) ||
+               (qsfp_interrupt_status[7] & QSFP_HIGH_POWER_WARNING))
+               dd_dev_info(dd,
+                       "%s: Cable TX channel 1/2 power too high\n",
+                       __func__);
+
+       if ((qsfp_interrupt_status[7] & QSFP_LOW_POWER_ALARM) ||
+               (qsfp_interrupt_status[7] & QSFP_LOW_POWER_WARNING))
+               dd_dev_info(dd,
+                       "%s: Cable TX channel 1/2 power too low\n",
+                       __func__);
+
+       if ((qsfp_interrupt_status[8] & QSFP_HIGH_POWER_ALARM) ||
+               (qsfp_interrupt_status[8] & QSFP_HIGH_POWER_WARNING))
+               dd_dev_info(dd,
+                       "%s: Cable TX channel 3/4 power too high\n",
+                       __func__);
+
+       if ((qsfp_interrupt_status[8] & QSFP_LOW_POWER_ALARM) ||
+               (qsfp_interrupt_status[8] & QSFP_LOW_POWER_WARNING))
+               dd_dev_info(dd,
+                       "%s: Cable TX channel 3/4 power too low\n",
+                       __func__);
+
+       /* Bytes 9-10 and 11-12 are reserved */
+       /* Bytes 13-15 are vendor specific */
+
+       return 0;
+}
+
+static int do_pre_lni_host_behaviors(struct hfi1_pportdata *ppd)
+{
+       refresh_qsfp_cache(ppd, &ppd->qsfp_info);
+
+       return 0;
+}
+
+static int do_qsfp_intr_fallback(struct hfi1_pportdata *ppd)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+       u8 qsfp_interrupt_status = 0;
+
+       if (qsfp_read(ppd, dd->hfi1_id, 2, &qsfp_interrupt_status, 1)
+               != 1) {
+               dd_dev_info(dd,
+                       "%s: Failed to read status of QSFP module\n",
+                       __func__);
+               return -EIO;
+       }
+
+       /* We don't care about alarms & warnings with a non-functional INT_N */
+       if (!(qsfp_interrupt_status & QSFP_DATA_NOT_READY))
+               do_pre_lni_host_behaviors(ppd);
+
+       return 0;
+}
+
+/* This routine will only be scheduled if the QSFP module is present */
+static void qsfp_event(struct work_struct *work)
+{
+       struct qsfp_data *qd;
+       struct hfi1_pportdata *ppd;
+       struct hfi1_devdata *dd;
+
+       qd = container_of(work, struct qsfp_data, qsfp_work);
+       ppd = qd->ppd;
+       dd = ppd->dd;
+
+       /* Sanity check */
+       if (!qsfp_mod_present(ppd))
+               return;
+
+       /*
+        * Turn DC back on after cables has been
+        * re-inserted. Up until now, the DC has been in
+        * reset to save power.
+        */
+       dc_start(dd);
+
+       if (qd->cache_refresh_required) {
+               msleep(3000);
+               reset_qsfp(ppd);
+
+               /* Check for QSFP interrupt after t_init (SFF 8679)
+                * + extra
+                */
+               msleep(3000);
+               if (!qd->qsfp_interrupt_functional) {
+                       if (do_qsfp_intr_fallback(ppd) < 0)
+                               dd_dev_info(dd, "%s: QSFP fallback failed\n",
+                                       __func__);
+                       ppd->driver_link_ready = 1;
+                       start_link(ppd);
+               }
+       }
+
+       if (qd->check_interrupt_flags) {
+               u8 qsfp_interrupt_status[16] = {0,};
+
+               if (qsfp_read(ppd, dd->hfi1_id, 6,
+                             &qsfp_interrupt_status[0], 16) != 16) {
+                       dd_dev_info(dd,
+                               "%s: Failed to read status of QSFP module\n",
+                               __func__);
+               } else {
+                       unsigned long flags;
+                       u8 data_status;
+
+                       spin_lock_irqsave(&ppd->qsfp_info.qsfp_lock, flags);
+                       ppd->qsfp_info.check_interrupt_flags = 0;
+                       spin_unlock_irqrestore(&ppd->qsfp_info.qsfp_lock,
+                                                               flags);
+
+                       if (qsfp_read(ppd, dd->hfi1_id, 2, &data_status, 1)
+                                != 1) {
+                               dd_dev_info(dd,
+                               "%s: Failed to read status of QSFP module\n",
+                                       __func__);
+                       }
+                       if (!(data_status & QSFP_DATA_NOT_READY)) {
+                               do_pre_lni_host_behaviors(ppd);
+                               start_link(ppd);
+                       } else
+                               handle_qsfp_error_conditions(ppd,
+                                               qsfp_interrupt_status);
+               }
+       }
+}
+
+void init_qsfp(struct hfi1_pportdata *ppd)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+       u64 qsfp_mask;
+
+       if (loopback == LOOPBACK_SERDES || loopback == LOOPBACK_LCB ||
+                       ppd->dd->icode == ICODE_FUNCTIONAL_SIMULATOR) {
+               ppd->driver_link_ready = 1;
+               return;
+       }
+
+       ppd->qsfp_info.ppd = ppd;
+       INIT_WORK(&ppd->qsfp_info.qsfp_work, qsfp_event);
+
+       qsfp_mask = (u64)(QSFP_HFI0_INT_N | QSFP_HFI0_MODPRST_N);
+       /* Clear current status to avoid spurious interrupts */
+       write_csr(dd,
+                       dd->hfi1_id ?
+                               ASIC_QSFP2_CLEAR :
+                               ASIC_QSFP1_CLEAR,
+               qsfp_mask);
+
+       /* Handle active low nature of INT_N and MODPRST_N pins */
+       if (qsfp_mod_present(ppd))
+               qsfp_mask &= ~(u64)QSFP_HFI0_MODPRST_N;
+       write_csr(dd,
+                 dd->hfi1_id ? ASIC_QSFP2_INVERT : ASIC_QSFP1_INVERT,
+                 qsfp_mask);
+
+       /* Allow only INT_N and MODPRST_N to trigger QSFP interrupts */
+       qsfp_mask |= (u64)QSFP_HFI0_MODPRST_N;
+       write_csr(dd,
+               dd->hfi1_id ? ASIC_QSFP2_MASK : ASIC_QSFP1_MASK,
+               qsfp_mask);
+
+       if (qsfp_mod_present(ppd)) {
+               msleep(3000);
+               reset_qsfp(ppd);
+
+               /* Check for QSFP interrupt after t_init (SFF 8679)
+                * + extra
+                */
+               msleep(3000);
+               if (!ppd->qsfp_info.qsfp_interrupt_functional) {
+                       if (do_qsfp_intr_fallback(ppd) < 0)
+                               dd_dev_info(dd,
+                                       "%s: QSFP fallback failed\n",
+                                       __func__);
+                       ppd->driver_link_ready = 1;
+               }
+       }
+}
+
+int bringup_serdes(struct hfi1_pportdata *ppd)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+       u64 guid;
+       int ret;
+
+       if (HFI1_CAP_IS_KSET(EXTENDED_PSN))
+               add_rcvctrl(dd, RCV_CTRL_RCV_EXTENDED_PSN_ENABLE_SMASK);
+
+       guid = ppd->guid;
+       if (!guid) {
+               if (dd->base_guid)
+                       guid = dd->base_guid + ppd->port - 1;
+               ppd->guid = guid;
+       }
+
+       /* the link defaults to enabled */
+       ppd->link_enabled = 1;
+       /* Set linkinit_reason on power up per OPA spec */
+       ppd->linkinit_reason = OPA_LINKINIT_REASON_LINKUP;
+
+       if (loopback) {
+               ret = init_loopback(dd);
+               if (ret < 0)
+                       return ret;
+       }
+
+       return start_link(ppd);
+}
+
+void hfi1_quiet_serdes(struct hfi1_pportdata *ppd)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+
+       /*
+        * Shut down the link and keep it down.   First turn off that the
+        * driver wants to allow the link to be up (driver_link_ready).
+        * Then make sure the link is not automatically restarted
+        * (link_enabled).  Cancel any pending restart.  And finally
+        * go offline.
+        */
+       ppd->driver_link_ready = 0;
+       ppd->link_enabled = 0;
+
+       set_link_down_reason(ppd, OPA_LINKDOWN_REASON_SMA_DISABLED, 0,
+         OPA_LINKDOWN_REASON_SMA_DISABLED);
+       set_link_state(ppd, HLS_DN_OFFLINE);
+
+       /* disable the port */
+       clear_rcvctrl(dd, RCV_CTRL_RCV_PORT_ENABLE_SMASK);
+}
+
+static inline int init_cpu_counters(struct hfi1_devdata *dd)
+{
+       struct hfi1_pportdata *ppd;
+       int i;
+
+       ppd = (struct hfi1_pportdata *)(dd + 1);
+       for (i = 0; i < dd->num_pports; i++, ppd++) {
+               ppd->ibport_data.rc_acks = NULL;
+               ppd->ibport_data.rc_qacks = NULL;
+               ppd->ibport_data.rc_acks = alloc_percpu(u64);
+               ppd->ibport_data.rc_qacks = alloc_percpu(u64);
+               ppd->ibport_data.rc_delayed_comp = alloc_percpu(u64);
+               if ((ppd->ibport_data.rc_acks == NULL) ||
+                   (ppd->ibport_data.rc_delayed_comp == NULL) ||
+                   (ppd->ibport_data.rc_qacks == NULL))
+                       return -ENOMEM;
+       }
+
+       return 0;
+}
+
+static const char * const pt_names[] = {
+       "expected",
+       "eager",
+       "invalid"
+};
+
+static const char *pt_name(u32 type)
+{
+       return type >= ARRAY_SIZE(pt_names) ? "unknown" : pt_names[type];
+}
+
+/*
+ * index is the index into the receive array
+ */
+void hfi1_put_tid(struct hfi1_devdata *dd, u32 index,
+                 u32 type, unsigned long pa, u16 order)
+{
+       u64 reg;
+       void __iomem *base = (dd->rcvarray_wc ? dd->rcvarray_wc :
+                             (dd->kregbase + RCV_ARRAY));
+
+       if (!(dd->flags & HFI1_PRESENT))
+               goto done;
+
+       if (type == PT_INVALID) {
+               pa = 0;
+       } else if (type > PT_INVALID) {
+               dd_dev_err(dd,
+                       "unexpected receive array type %u for index %u, not handled\n",
+                       type, index);
+               goto done;
+       }
+
+       hfi1_cdbg(TID, "type %s, index 0x%x, pa 0x%lx, bsize 0x%lx",
+                 pt_name(type), index, pa, (unsigned long)order);
+
+#define RT_ADDR_SHIFT 12       /* 4KB kernel address boundary */
+       reg = RCV_ARRAY_RT_WRITE_ENABLE_SMASK
+               | (u64)order << RCV_ARRAY_RT_BUF_SIZE_SHIFT
+               | ((pa >> RT_ADDR_SHIFT) & RCV_ARRAY_RT_ADDR_MASK)
+                                       << RCV_ARRAY_RT_ADDR_SHIFT;
+       writeq(reg, base + (index * 8));
+
+       if (type == PT_EAGER)
+               /*
+                * Eager entries are written one-by-one so we have to push them
+                * after we write the entry.
+                */
+               flush_wc();
+done:
+       return;
+}
+
+void hfi1_clear_tids(struct hfi1_ctxtdata *rcd)
+{
+       struct hfi1_devdata *dd = rcd->dd;
+       u32 i;
+
+       /* this could be optimized */
+       for (i = rcd->eager_base; i < rcd->eager_base +
+                    rcd->egrbufs.alloced; i++)
+               hfi1_put_tid(dd, i, PT_INVALID, 0, 0);
+
+       for (i = rcd->expected_base;
+                       i < rcd->expected_base + rcd->expected_count; i++)
+               hfi1_put_tid(dd, i, PT_INVALID, 0, 0);
+}
+
+int hfi1_get_base_kinfo(struct hfi1_ctxtdata *rcd,
+                       struct hfi1_ctxt_info *kinfo)
+{
+       kinfo->runtime_flags = (HFI1_MISC_GET() << HFI1_CAP_USER_SHIFT) |
+               HFI1_CAP_UGET(MASK) | HFI1_CAP_KGET(K2U);
+       return 0;
+}
+
+struct hfi1_message_header *hfi1_get_msgheader(
+                               struct hfi1_devdata *dd, __le32 *rhf_addr)
+{
+       u32 offset = rhf_hdrq_offset(rhf_to_cpu(rhf_addr));
+
+       return (struct hfi1_message_header *)
+               (rhf_addr - dd->rhf_offset + offset);
+}
+
+static const char * const ib_cfg_name_strings[] = {
+       "HFI1_IB_CFG_LIDLMC",
+       "HFI1_IB_CFG_LWID_DG_ENB",
+       "HFI1_IB_CFG_LWID_ENB",
+       "HFI1_IB_CFG_LWID",
+       "HFI1_IB_CFG_SPD_ENB",
+       "HFI1_IB_CFG_SPD",
+       "HFI1_IB_CFG_RXPOL_ENB",
+       "HFI1_IB_CFG_LREV_ENB",
+       "HFI1_IB_CFG_LINKLATENCY",
+       "HFI1_IB_CFG_HRTBT",
+       "HFI1_IB_CFG_OP_VLS",
+       "HFI1_IB_CFG_VL_HIGH_CAP",
+       "HFI1_IB_CFG_VL_LOW_CAP",
+       "HFI1_IB_CFG_OVERRUN_THRESH",
+       "HFI1_IB_CFG_PHYERR_THRESH",
+       "HFI1_IB_CFG_LINKDEFAULT",
+       "HFI1_IB_CFG_PKEYS",
+       "HFI1_IB_CFG_MTU",
+       "HFI1_IB_CFG_LSTATE",
+       "HFI1_IB_CFG_VL_HIGH_LIMIT",
+       "HFI1_IB_CFG_PMA_TICKS",
+       "HFI1_IB_CFG_PORT"
+};
+
+static const char *ib_cfg_name(int which)
+{
+       if (which < 0 || which >= ARRAY_SIZE(ib_cfg_name_strings))
+               return "invalid";
+       return ib_cfg_name_strings[which];
+}
+
+int hfi1_get_ib_cfg(struct hfi1_pportdata *ppd, int which)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+       int val = 0;
+
+       switch (which) {
+       case HFI1_IB_CFG_LWID_ENB: /* allowed Link-width */
+               val = ppd->link_width_enabled;
+               break;
+       case HFI1_IB_CFG_LWID: /* currently active Link-width */
+               val = ppd->link_width_active;
+               break;
+       case HFI1_IB_CFG_SPD_ENB: /* allowed Link speeds */
+               val = ppd->link_speed_enabled;
+               break;
+       case HFI1_IB_CFG_SPD: /* current Link speed */
+               val = ppd->link_speed_active;
+               break;
+
+       case HFI1_IB_CFG_RXPOL_ENB: /* Auto-RX-polarity enable */
+       case HFI1_IB_CFG_LREV_ENB: /* Auto-Lane-reversal enable */
+       case HFI1_IB_CFG_LINKLATENCY:
+               goto unimplemented;
+
+       case HFI1_IB_CFG_OP_VLS:
+               val = ppd->vls_operational;
+               break;
+       case HFI1_IB_CFG_VL_HIGH_CAP: /* VL arb high priority table size */
+               val = VL_ARB_HIGH_PRIO_TABLE_SIZE;
+               break;
+       case HFI1_IB_CFG_VL_LOW_CAP: /* VL arb low priority table size */
+               val = VL_ARB_LOW_PRIO_TABLE_SIZE;
+               break;
+       case HFI1_IB_CFG_OVERRUN_THRESH: /* IB overrun threshold */
+               val = ppd->overrun_threshold;
+               break;
+       case HFI1_IB_CFG_PHYERR_THRESH: /* IB PHY error threshold */
+               val = ppd->phy_error_threshold;
+               break;
+       case HFI1_IB_CFG_LINKDEFAULT: /* IB link default (sleep/poll) */
+               val = dd->link_default;
+               break;
+
+       case HFI1_IB_CFG_HRTBT: /* Heartbeat off/enable/auto */
+       case HFI1_IB_CFG_PMA_TICKS:
+       default:
+unimplemented:
+               if (HFI1_CAP_IS_KSET(PRINT_UNIMPL))
+                       dd_dev_info(
+                               dd,
+                               "%s: which %s: not implemented\n",
+                               __func__,
+                               ib_cfg_name(which));
+               break;
+       }
+
+       return val;
+}
+
+/*
+ * The largest MAD packet size.
+ */
+#define MAX_MAD_PACKET 2048
+
+/*
+ * Return the maximum header bytes that can go on the _wire_
+ * for this device. This count includes the ICRC which is
+ * not part of the packet held in memory but it is appended
+ * by the HW.
+ * This is dependent on the device's receive header entry size.
+ * HFI allows this to be set per-receive context, but the
+ * driver presently enforces a global value.
+ */
+u32 lrh_max_header_bytes(struct hfi1_devdata *dd)
+{
+       /*
+        * The maximum non-payload (MTU) bytes in LRH.PktLen are
+        * the Receive Header Entry Size minus the PBC (or RHF) size
+        * plus one DW for the ICRC appended by HW.
+        *
+        * dd->rcd[0].rcvhdrqentsize is in DW.
+        * We use rcd[0] as all context will have the same value. Also,
+        * the first kernel context would have been allocated by now so
+        * we are guaranteed a valid value.
+        */
+       return (dd->rcd[0]->rcvhdrqentsize - 2/*PBC/RHF*/ + 1/*ICRC*/) << 2;
+}
+
+/*
+ * Set Send Length
+ * @ppd - per port data
+ *
+ * Set the MTU by limiting how many DWs may be sent.  The SendLenCheck*
+ * registers compare against LRH.PktLen, so use the max bytes included
+ * in the LRH.
+ *
+ * This routine changes all VL values except VL15, which it maintains at
+ * the same value.
+ */
+static void set_send_length(struct hfi1_pportdata *ppd)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+       u32 max_hb = lrh_max_header_bytes(dd), maxvlmtu = 0, dcmtu;
+       u64 len1 = 0, len2 = (((dd->vld[15].mtu + max_hb) >> 2)
+                             & SEND_LEN_CHECK1_LEN_VL15_MASK) <<
+               SEND_LEN_CHECK1_LEN_VL15_SHIFT;
+       int i;
+
+       for (i = 0; i < ppd->vls_supported; i++) {
+               if (dd->vld[i].mtu > maxvlmtu)
+                       maxvlmtu = dd->vld[i].mtu;
+               if (i <= 3)
+                       len1 |= (((dd->vld[i].mtu + max_hb) >> 2)
+                                & SEND_LEN_CHECK0_LEN_VL0_MASK) <<
+                               ((i % 4) * SEND_LEN_CHECK0_LEN_VL1_SHIFT);
+               else
+                       len2 |= (((dd->vld[i].mtu + max_hb) >> 2)
+                                & SEND_LEN_CHECK1_LEN_VL4_MASK) <<
+                               ((i % 4) * SEND_LEN_CHECK1_LEN_VL5_SHIFT);
+       }
+       write_csr(dd, SEND_LEN_CHECK0, len1);
+       write_csr(dd, SEND_LEN_CHECK1, len2);
+       /* adjust kernel credit return thresholds based on new MTUs */
+       /* all kernel receive contexts have the same hdrqentsize */
+       for (i = 0; i < ppd->vls_supported; i++) {
+               sc_set_cr_threshold(dd->vld[i].sc,
+                       sc_mtu_to_threshold(dd->vld[i].sc, dd->vld[i].mtu,
+                               dd->rcd[0]->rcvhdrqentsize));
+       }
+       sc_set_cr_threshold(dd->vld[15].sc,
+               sc_mtu_to_threshold(dd->vld[15].sc, dd->vld[15].mtu,
+                       dd->rcd[0]->rcvhdrqentsize));
+
+       /* Adjust maximum MTU for the port in DC */
+       dcmtu = maxvlmtu == 10240 ? DCC_CFG_PORT_MTU_CAP_10240 :
+               (ilog2(maxvlmtu >> 8) + 1);
+       len1 = read_csr(ppd->dd, DCC_CFG_PORT_CONFIG);
+       len1 &= ~DCC_CFG_PORT_CONFIG_MTU_CAP_SMASK;
+       len1 |= ((u64)dcmtu & DCC_CFG_PORT_CONFIG_MTU_CAP_MASK) <<
+               DCC_CFG_PORT_CONFIG_MTU_CAP_SHIFT;
+       write_csr(ppd->dd, DCC_CFG_PORT_CONFIG, len1);
+}
+
+static void set_lidlmc(struct hfi1_pportdata *ppd)
+{
+       int i;
+       u64 sreg = 0;
+       struct hfi1_devdata *dd = ppd->dd;
+       u32 mask = ~((1U << ppd->lmc) - 1);
+       u64 c1 = read_csr(ppd->dd, DCC_CFG_PORT_CONFIG1);
+
+       if (dd->hfi1_snoop.mode_flag)
+               dd_dev_info(dd, "Set lid/lmc while snooping");
+
+       c1 &= ~(DCC_CFG_PORT_CONFIG1_TARGET_DLID_SMASK
+               | DCC_CFG_PORT_CONFIG1_DLID_MASK_SMASK);
+       c1 |= ((ppd->lid & DCC_CFG_PORT_CONFIG1_TARGET_DLID_MASK)
+                       << DCC_CFG_PORT_CONFIG1_TARGET_DLID_SHIFT)|
+             ((mask & DCC_CFG_PORT_CONFIG1_DLID_MASK_MASK)
+                       << DCC_CFG_PORT_CONFIG1_DLID_MASK_SHIFT);
+       write_csr(ppd->dd, DCC_CFG_PORT_CONFIG1, c1);
+
+       /*
+        * Iterate over all the send contexts and set their SLID check
+        */
+       sreg = ((mask & SEND_CTXT_CHECK_SLID_MASK_MASK) <<
+                       SEND_CTXT_CHECK_SLID_MASK_SHIFT) |
+              (((ppd->lid & mask) & SEND_CTXT_CHECK_SLID_VALUE_MASK) <<
+                       SEND_CTXT_CHECK_SLID_VALUE_SHIFT);
+
+       for (i = 0; i < dd->chip_send_contexts; i++) {
+               hfi1_cdbg(LINKVERB, "SendContext[%d].SLID_CHECK = 0x%x",
+                         i, (u32)sreg);
+               write_kctxt_csr(dd, i, SEND_CTXT_CHECK_SLID, sreg);
+       }
+
+       /* Now we have to do the same thing for the sdma engines */
+       sdma_update_lmc(dd, mask, ppd->lid);
+}
+
+static int wait_phy_linkstate(struct hfi1_devdata *dd, u32 state, u32 msecs)
+{
+       unsigned long timeout;
+       u32 curr_state;
+
+       timeout = jiffies + msecs_to_jiffies(msecs);
+       while (1) {
+               curr_state = read_physical_state(dd);
+               if (curr_state == state)
+                       break;
+               if (time_after(jiffies, timeout)) {
+                       dd_dev_err(dd,
+                               "timeout waiting for phy link state 0x%x, current state is 0x%x\n",
+                               state, curr_state);
+                       return -ETIMEDOUT;
+               }
+               usleep_range(1950, 2050); /* sleep 2ms-ish */
+       }
+
+       return 0;
+}
+
+/*
+ * Helper for set_link_state().  Do not call except from that routine.
+ * Expects ppd->hls_mutex to be held.
+ *
+ * @rem_reason value to be sent to the neighbor
+ *
+ * LinkDownReasons only set if transition succeeds.
+ */
+static int goto_offline(struct hfi1_pportdata *ppd, u8 rem_reason)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+       u32 pstate, previous_state;
+       u32 last_local_state;
+       u32 last_remote_state;
+       int ret;
+       int do_transition;
+       int do_wait;
+
+       previous_state = ppd->host_link_state;
+       ppd->host_link_state = HLS_GOING_OFFLINE;
+       pstate = read_physical_state(dd);
+       if (pstate == PLS_OFFLINE) {
+               do_transition = 0;      /* in right state */
+               do_wait = 0;            /* ...no need to wait */
+       } else if ((pstate & 0xff) == PLS_OFFLINE) {
+               do_transition = 0;      /* in an offline transient state */
+               do_wait = 1;            /* ...wait for it to settle */
+       } else {
+               do_transition = 1;      /* need to move to offline */
+               do_wait = 1;            /* ...will need to wait */
+       }
+
+       if (do_transition) {
+               ret = set_physical_link_state(dd,
+                       PLS_OFFLINE | (rem_reason << 8));
+
+               if (ret != HCMD_SUCCESS) {
+                       dd_dev_err(dd,
+                               "Failed to transition to Offline link state, return %d\n",
+                               ret);
+                       return -EINVAL;
+               }
+               if (ppd->offline_disabled_reason == OPA_LINKDOWN_REASON_NONE)
+                       ppd->offline_disabled_reason =
+                       OPA_LINKDOWN_REASON_TRANSIENT;
+       }
+
+       if (do_wait) {
+               /* it can take a while for the link to go down */
+               ret = wait_phy_linkstate(dd, PLS_OFFLINE, 10000);
+               if (ret < 0)
+                       return ret;
+       }
+
+       /* make sure the logical state is also down */
+       wait_logical_linkstate(ppd, IB_PORT_DOWN, 1000);
+
+       /*
+        * Now in charge of LCB - must be after the physical state is
+        * offline.quiet and before host_link_state is changed.
+        */
+       set_host_lcb_access(dd);
+       write_csr(dd, DC_LCB_ERR_EN, ~0ull); /* watch LCB errors */
+       ppd->host_link_state = HLS_LINK_COOLDOWN; /* LCB access allowed */
+
+       /*
+        * The LNI has a mandatory wait time after the physical state
+        * moves to Offline.Quiet.  The wait time may be different
+        * depending on how the link went down.  The 8051 firmware
+        * will observe the needed wait time and only move to ready
+        * when that is completed.  The largest of the quiet timeouts
+        * is 2.5s, so wait that long and then a bit more.
+        */
+       ret = wait_fm_ready(dd, 3000);
+       if (ret) {
+               dd_dev_err(dd,
+                       "After going offline, timed out waiting for the 8051 to become ready to accept host requests\n");
+               /* state is really offline, so make it so */
+               ppd->host_link_state = HLS_DN_OFFLINE;
+               return ret;
+       }
+
+       /*
+        * The state is now offline and the 8051 is ready to accept host
+        * requests.
+        *      - change our state
+        *      - notify others if we were previously in a linkup state
+        */
+       ppd->host_link_state = HLS_DN_OFFLINE;
+       if (previous_state & HLS_UP) {
+               /* went down while link was up */
+               handle_linkup_change(dd, 0);
+       } else if (previous_state
+                       & (HLS_DN_POLL | HLS_VERIFY_CAP | HLS_GOING_UP)) {
+               /* went down while attempting link up */
+               /* byte 1 of last_*_state is the failure reason */
+               read_last_local_state(dd, &last_local_state);
+               read_last_remote_state(dd, &last_remote_state);
+               dd_dev_err(dd,
+                       "LNI failure last states: local 0x%08x, remote 0x%08x\n",
+                       last_local_state, last_remote_state);
+       }
+
+       /* the active link width (downgrade) is 0 on link down */
+       ppd->link_width_active = 0;
+       ppd->link_width_downgrade_tx_active = 0;
+       ppd->link_width_downgrade_rx_active = 0;
+       ppd->current_egress_rate = 0;
+       return 0;
+}
+
+/* return the link state name */
+static const char *link_state_name(u32 state)
+{
+       const char *name;
+       int n = ilog2(state);
+       static const char * const names[] = {
+               [__HLS_UP_INIT_BP]       = "INIT",
+               [__HLS_UP_ARMED_BP]      = "ARMED",
+               [__HLS_UP_ACTIVE_BP]     = "ACTIVE",
+               [__HLS_DN_DOWNDEF_BP]    = "DOWNDEF",
+               [__HLS_DN_POLL_BP]       = "POLL",
+               [__HLS_DN_DISABLE_BP]    = "DISABLE",
+               [__HLS_DN_OFFLINE_BP]    = "OFFLINE",
+               [__HLS_VERIFY_CAP_BP]    = "VERIFY_CAP",
+               [__HLS_GOING_UP_BP]      = "GOING_UP",
+               [__HLS_GOING_OFFLINE_BP] = "GOING_OFFLINE",
+               [__HLS_LINK_COOLDOWN_BP] = "LINK_COOLDOWN"
+       };
+
+       name = n < ARRAY_SIZE(names) ? names[n] : NULL;
+       return name ? name : "unknown";
+}
+
+/* return the link state reason name */
+static const char *link_state_reason_name(struct hfi1_pportdata *ppd, u32 state)
+{
+       if (state == HLS_UP_INIT) {
+               switch (ppd->linkinit_reason) {
+               case OPA_LINKINIT_REASON_LINKUP:
+                       return "(LINKUP)";
+               case OPA_LINKINIT_REASON_FLAPPING:
+                       return "(FLAPPING)";
+               case OPA_LINKINIT_OUTSIDE_POLICY:
+                       return "(OUTSIDE_POLICY)";
+               case OPA_LINKINIT_QUARANTINED:
+                       return "(QUARANTINED)";
+               case OPA_LINKINIT_INSUFIC_CAPABILITY:
+                       return "(INSUFIC_CAPABILITY)";
+               default:
+                       break;
+               }
+       }
+       return "";
+}
+
+/*
+ * driver_physical_state - convert the driver's notion of a port's
+ * state (an HLS_*) into a physical state (a {IB,OPA}_PORTPHYSSTATE_*).
+ * Return -1 (converted to a u32) to indicate error.
+ */
+u32 driver_physical_state(struct hfi1_pportdata *ppd)
+{
+       switch (ppd->host_link_state) {
+       case HLS_UP_INIT:
+       case HLS_UP_ARMED:
+       case HLS_UP_ACTIVE:
+               return IB_PORTPHYSSTATE_LINKUP;
+       case HLS_DN_POLL:
+               return IB_PORTPHYSSTATE_POLLING;
+       case HLS_DN_DISABLE:
+               return IB_PORTPHYSSTATE_DISABLED;
+       case HLS_DN_OFFLINE:
+               return OPA_PORTPHYSSTATE_OFFLINE;
+       case HLS_VERIFY_CAP:
+               return IB_PORTPHYSSTATE_POLLING;
+       case HLS_GOING_UP:
+               return IB_PORTPHYSSTATE_POLLING;
+       case HLS_GOING_OFFLINE:
+               return OPA_PORTPHYSSTATE_OFFLINE;
+       case HLS_LINK_COOLDOWN:
+               return OPA_PORTPHYSSTATE_OFFLINE;
+       case HLS_DN_DOWNDEF:
+       default:
+               dd_dev_err(ppd->dd, "invalid host_link_state 0x%x\n",
+                          ppd->host_link_state);
+               return  -1;
+       }
+}
+
+/*
+ * driver_logical_state - convert the driver's notion of a port's
+ * state (an HLS_*) into a logical state (a IB_PORT_*). Return -1
+ * (converted to a u32) to indicate error.
+ */
+u32 driver_logical_state(struct hfi1_pportdata *ppd)
+{
+       if (ppd->host_link_state && !(ppd->host_link_state & HLS_UP))
+               return IB_PORT_DOWN;
+
+       switch (ppd->host_link_state & HLS_UP) {
+       case HLS_UP_INIT:
+               return IB_PORT_INIT;
+       case HLS_UP_ARMED:
+               return IB_PORT_ARMED;
+       case HLS_UP_ACTIVE:
+               return IB_PORT_ACTIVE;
+       default:
+               dd_dev_err(ppd->dd, "invalid host_link_state 0x%x\n",
+                          ppd->host_link_state);
+       return -1;
+       }
+}
+
+void set_link_down_reason(struct hfi1_pportdata *ppd, u8 lcl_reason,
+                         u8 neigh_reason, u8 rem_reason)
+{
+       if (ppd->local_link_down_reason.latest == 0 &&
+           ppd->neigh_link_down_reason.latest == 0) {
+               ppd->local_link_down_reason.latest = lcl_reason;
+               ppd->neigh_link_down_reason.latest = neigh_reason;
+               ppd->remote_link_down_reason = rem_reason;
+       }
+}
+
+/*
+ * Change the physical and/or logical link state.
+ *
+ * Do not call this routine while inside an interrupt.  It contains
+ * calls to routines that can take multiple seconds to finish.
+ *
+ * Returns 0 on success, -errno on failure.
+ */
+int set_link_state(struct hfi1_pportdata *ppd, u32 state)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+       struct ib_event event = {.device = NULL};
+       int ret1, ret = 0;
+       int was_up, is_down;
+       int orig_new_state, poll_bounce;
+
+       mutex_lock(&ppd->hls_lock);
+
+       orig_new_state = state;
+       if (state == HLS_DN_DOWNDEF)
+               state = dd->link_default;
+
+       /* interpret poll -> poll as a link bounce */
+       poll_bounce = ppd->host_link_state == HLS_DN_POLL
+                               && state == HLS_DN_POLL;
+
+       dd_dev_info(dd, "%s: current %s, new %s %s%s\n", __func__,
+               link_state_name(ppd->host_link_state),
+               link_state_name(orig_new_state),
+               poll_bounce ? "(bounce) " : "",
+               link_state_reason_name(ppd, state));
+
+       was_up = !!(ppd->host_link_state & HLS_UP);
+
+       /*
+        * If we're going to a (HLS_*) link state that implies the logical
+        * link state is neither of (IB_PORT_ARMED, IB_PORT_ACTIVE), then
+        * reset is_sm_config_started to 0.
+        */
+       if (!(state & (HLS_UP_ARMED | HLS_UP_ACTIVE)))
+               ppd->is_sm_config_started = 0;
+
+       /*
+        * Do nothing if the states match.  Let a poll to poll link bounce
+        * go through.
+        */
+       if (ppd->host_link_state == state && !poll_bounce)
+               goto done;
+
+       switch (state) {
+       case HLS_UP_INIT:
+               if (ppd->host_link_state == HLS_DN_POLL && (quick_linkup
+                           || dd->icode == ICODE_FUNCTIONAL_SIMULATOR)) {
+                       /*
+                        * Quick link up jumps from polling to here.
+                        *
+                        * Whether in normal or loopback mode, the
+                        * simulator jumps from polling to link up.
+                        * Accept that here.
+                        */
+                       /* OK */;
+               } else if (ppd->host_link_state != HLS_GOING_UP) {
+                       goto unexpected;
+               }
+
+               ppd->host_link_state = HLS_UP_INIT;
+               ret = wait_logical_linkstate(ppd, IB_PORT_INIT, 1000);
+               if (ret) {
+                       /* logical state didn't change, stay at going_up */
+                       ppd->host_link_state = HLS_GOING_UP;
+                       dd_dev_err(dd,
+                               "%s: logical state did not change to INIT\n",
+                               __func__);
+               } else {
+                       /* clear old transient LINKINIT_REASON code */
+                       if (ppd->linkinit_reason >= OPA_LINKINIT_REASON_CLEAR)
+                               ppd->linkinit_reason =
+                                       OPA_LINKINIT_REASON_LINKUP;
+
+                       /* enable the port */
+                       add_rcvctrl(dd, RCV_CTRL_RCV_PORT_ENABLE_SMASK);
+
+                       handle_linkup_change(dd, 1);
+               }
+               break;
+       case HLS_UP_ARMED:
+               if (ppd->host_link_state != HLS_UP_INIT)
+                       goto unexpected;
+
+               ppd->host_link_state = HLS_UP_ARMED;
+               set_logical_state(dd, LSTATE_ARMED);
+               ret = wait_logical_linkstate(ppd, IB_PORT_ARMED, 1000);
+               if (ret) {
+                       /* logical state didn't change, stay at init */
+                       ppd->host_link_state = HLS_UP_INIT;
+                       dd_dev_err(dd,
+                               "%s: logical state did not change to ARMED\n",
+                               __func__);
+               }
+               /*
+                * The simulator does not currently implement SMA messages,
+                * so neighbor_normal is not set.  Set it here when we first
+                * move to Armed.
+                */
+               if (dd->icode == ICODE_FUNCTIONAL_SIMULATOR)
+                       ppd->neighbor_normal = 1;
+               break;
+       case HLS_UP_ACTIVE:
+               if (ppd->host_link_state != HLS_UP_ARMED)
+                       goto unexpected;
+
+               ppd->host_link_state = HLS_UP_ACTIVE;
+               set_logical_state(dd, LSTATE_ACTIVE);
+               ret = wait_logical_linkstate(ppd, IB_PORT_ACTIVE, 1000);
+               if (ret) {
+                       /* logical state didn't change, stay at armed */
+                       ppd->host_link_state = HLS_UP_ARMED;
+                       dd_dev_err(dd,
+                               "%s: logical state did not change to ACTIVE\n",
+                               __func__);
+               } else {
+
+                       /* tell all engines to go running */
+                       sdma_all_running(dd);
+
+                       /* Signal the IB layer that the port has went active */
+                       event.device = &dd->verbs_dev.ibdev;
+                       event.element.port_num = ppd->port;
+                       event.event = IB_EVENT_PORT_ACTIVE;
+               }
+               break;
+       case HLS_DN_POLL:
+               if ((ppd->host_link_state == HLS_DN_DISABLE ||
+                    ppd->host_link_state == HLS_DN_OFFLINE) &&
+                   dd->dc_shutdown)
+                       dc_start(dd);
+               /* Hand LED control to the DC */
+               write_csr(dd, DCC_CFG_LED_CNTRL, 0);
+
+               if (ppd->host_link_state != HLS_DN_OFFLINE) {
+                       u8 tmp = ppd->link_enabled;
+
+                       ret = goto_offline(ppd, ppd->remote_link_down_reason);
+                       if (ret) {
+                               ppd->link_enabled = tmp;
+                               break;
+                       }
+                       ppd->remote_link_down_reason = 0;
+
+                       if (ppd->driver_link_ready)
+                               ppd->link_enabled = 1;
+               }
+
+               ret = set_local_link_attributes(ppd);
+               if (ret)
+                       break;
+
+               ppd->port_error_action = 0;
+               ppd->host_link_state = HLS_DN_POLL;
+
+               if (quick_linkup) {
+                       /* quick linkup does not go into polling */
+                       ret = do_quick_linkup(dd);
+               } else {
+                       ret1 = set_physical_link_state(dd, PLS_POLLING);
+                       if (ret1 != HCMD_SUCCESS) {
+                               dd_dev_err(dd,
+                                       "Failed to transition to Polling link state, return 0x%x\n",
+                                       ret1);
+                               ret = -EINVAL;
+                       }
+               }
+               ppd->offline_disabled_reason = OPA_LINKDOWN_REASON_NONE;
+               /*
+                * If an error occurred above, go back to offline.  The
+                * caller may reschedule another attempt.
+                */
+               if (ret)
+                       goto_offline(ppd, 0);
+               break;
+       case HLS_DN_DISABLE:
+               /* link is disabled */
+               ppd->link_enabled = 0;
+
+               /* allow any state to transition to disabled */
+
+               /* must transition to offline first */
+               if (ppd->host_link_state != HLS_DN_OFFLINE) {
+                       ret = goto_offline(ppd, ppd->remote_link_down_reason);
+                       if (ret)
+                               break;
+                       ppd->remote_link_down_reason = 0;
+               }
+
+               ret1 = set_physical_link_state(dd, PLS_DISABLED);
+               if (ret1 != HCMD_SUCCESS) {
+                       dd_dev_err(dd,
+                               "Failed to transition to Disabled link state, return 0x%x\n",
+                               ret1);
+                       ret = -EINVAL;
+                       break;
+               }
+               ppd->host_link_state = HLS_DN_DISABLE;
+               dc_shutdown(dd);
+               break;
+       case HLS_DN_OFFLINE:
+               if (ppd->host_link_state == HLS_DN_DISABLE)
+                       dc_start(dd);
+
+               /* allow any state to transition to offline */
+               ret = goto_offline(ppd, ppd->remote_link_down_reason);
+               if (!ret)
+                       ppd->remote_link_down_reason = 0;
+               break;
+       case HLS_VERIFY_CAP:
+               if (ppd->host_link_state != HLS_DN_POLL)
+                       goto unexpected;
+               ppd->host_link_state = HLS_VERIFY_CAP;
+               break;
+       case HLS_GOING_UP:
+               if (ppd->host_link_state != HLS_VERIFY_CAP)
+                       goto unexpected;
+
+               ret1 = set_physical_link_state(dd, PLS_LINKUP);
+               if (ret1 != HCMD_SUCCESS) {
+                       dd_dev_err(dd,
+                               "Failed to transition to link up state, return 0x%x\n",
+                               ret1);
+                       ret = -EINVAL;
+                       break;
+               }
+               ppd->host_link_state = HLS_GOING_UP;
+               break;
+
+       case HLS_GOING_OFFLINE:         /* transient within goto_offline() */
+       case HLS_LINK_COOLDOWN:         /* transient within goto_offline() */
+       default:
+               dd_dev_info(dd, "%s: state 0x%x: not supported\n",
+                       __func__, state);
+               ret = -EINVAL;
+               break;
+       }
+
+       is_down = !!(ppd->host_link_state & (HLS_DN_POLL |
+                       HLS_DN_DISABLE | HLS_DN_OFFLINE));
+
+       if (was_up && is_down && ppd->local_link_down_reason.sma == 0 &&
+           ppd->neigh_link_down_reason.sma == 0) {
+               ppd->local_link_down_reason.sma =
+                 ppd->local_link_down_reason.latest;
+               ppd->neigh_link_down_reason.sma =
+                 ppd->neigh_link_down_reason.latest;
+       }
+
+       goto done;
+
+unexpected:
+       dd_dev_err(dd, "%s: unexpected state transition from %s to %s\n",
+               __func__, link_state_name(ppd->host_link_state),
+               link_state_name(state));
+       ret = -EINVAL;
+
+done:
+       mutex_unlock(&ppd->hls_lock);
+
+       if (event.device)
+               ib_dispatch_event(&event);
+
+       return ret;
+}
+
+int hfi1_set_ib_cfg(struct hfi1_pportdata *ppd, int which, u32 val)
+{
+       u64 reg;
+       int ret = 0;
+
+       switch (which) {
+       case HFI1_IB_CFG_LIDLMC:
+               set_lidlmc(ppd);
+               break;
+       case HFI1_IB_CFG_VL_HIGH_LIMIT:
+               /*
+                * The VL Arbitrator high limit is sent in units of 4k
+                * bytes, while HFI stores it in units of 64 bytes.
+                */
+               val *= 4096/64;
+               reg = ((u64)val & SEND_HIGH_PRIORITY_LIMIT_LIMIT_MASK)
+                       << SEND_HIGH_PRIORITY_LIMIT_LIMIT_SHIFT;
+               write_csr(ppd->dd, SEND_HIGH_PRIORITY_LIMIT, reg);
+               break;
+       case HFI1_IB_CFG_LINKDEFAULT: /* IB link default (sleep/poll) */
+               /* HFI only supports POLL as the default link down state */
+               if (val != HLS_DN_POLL)
+                       ret = -EINVAL;
+               break;
+       case HFI1_IB_CFG_OP_VLS:
+               if (ppd->vls_operational != val) {
+                       ppd->vls_operational = val;
+                       if (!ppd->port)
+                               ret = -EINVAL;
+                       else
+                               ret = sdma_map_init(
+                                       ppd->dd,
+                                       ppd->port - 1,
+                                       val,
+                                       NULL);
+               }
+               break;
+       /*
+        * For link width, link width downgrade, and speed enable, always AND
+        * the setting with what is actually supported.  This has two benefits.
+        * First, enabled can't have unsupported values, no matter what the
+        * SM or FM might want.  Second, the ALL_SUPPORTED wildcards that mean
+        * "fill in with your supported value" have all the bits in the
+        * field set, so simply ANDing with supported has the desired result.
+        */
+       case HFI1_IB_CFG_LWID_ENB: /* set allowed Link-width */
+               ppd->link_width_enabled = val & ppd->link_width_supported;
+               break;
+       case HFI1_IB_CFG_LWID_DG_ENB: /* set allowed link width downgrade */
+               ppd->link_width_downgrade_enabled =
+                               val & ppd->link_width_downgrade_supported;
+               break;
+       case HFI1_IB_CFG_SPD_ENB: /* allowed Link speeds */
+               ppd->link_speed_enabled = val & ppd->link_speed_supported;
+               break;
+       case HFI1_IB_CFG_OVERRUN_THRESH: /* IB overrun threshold */
+               /*
+                * HFI does not follow IB specs, save this value
+                * so we can report it, if asked.
+                */
+               ppd->overrun_threshold = val;
+               break;
+       case HFI1_IB_CFG_PHYERR_THRESH: /* IB PHY error threshold */
+               /*
+                * HFI does not follow IB specs, save this value
+                * so we can report it, if asked.
+                */
+               ppd->phy_error_threshold = val;
+               break;
+
+       case HFI1_IB_CFG_MTU:
+               set_send_length(ppd);
+               break;
+
+       case HFI1_IB_CFG_PKEYS:
+               if (HFI1_CAP_IS_KSET(PKEY_CHECK))
+                       set_partition_keys(ppd);
+               break;
+
+       default:
+               if (HFI1_CAP_IS_KSET(PRINT_UNIMPL))
+                       dd_dev_info(ppd->dd,
+                         "%s: which %s, val 0x%x: not implemented\n",
+                         __func__, ib_cfg_name(which), val);
+               break;
+       }
+       return ret;
+}
+
+/* begin functions related to vl arbitration table caching */
+static void init_vl_arb_caches(struct hfi1_pportdata *ppd)
+{
+       int i;
+
+       BUILD_BUG_ON(VL_ARB_TABLE_SIZE !=
+                       VL_ARB_LOW_PRIO_TABLE_SIZE);
+       BUILD_BUG_ON(VL_ARB_TABLE_SIZE !=
+                       VL_ARB_HIGH_PRIO_TABLE_SIZE);
+
+       /*
+        * Note that we always return values directly from the
+        * 'vl_arb_cache' (and do no CSR reads) in response to a
+        * 'Get(VLArbTable)'. This is obviously correct after a
+        * 'Set(VLArbTable)', since the cache will then be up to
+        * date. But it's also correct prior to any 'Set(VLArbTable)'
+        * since then both the cache, and the relevant h/w registers
+        * will be zeroed.
+        */
+
+       for (i = 0; i < MAX_PRIO_TABLE; i++)
+               spin_lock_init(&ppd->vl_arb_cache[i].lock);
+}
+
+/*
+ * vl_arb_lock_cache
+ *
+ * All other vl_arb_* functions should be called only after locking
+ * the cache.
+ */
+static inline struct vl_arb_cache *
+vl_arb_lock_cache(struct hfi1_pportdata *ppd, int idx)
+{
+       if (idx != LO_PRIO_TABLE && idx != HI_PRIO_TABLE)
+               return NULL;
+       spin_lock(&ppd->vl_arb_cache[idx].lock);
+       return &ppd->vl_arb_cache[idx];
+}
+
+static inline void vl_arb_unlock_cache(struct hfi1_pportdata *ppd, int idx)
+{
+       spin_unlock(&ppd->vl_arb_cache[idx].lock);
+}
+
+static void vl_arb_get_cache(struct vl_arb_cache *cache,
+                            struct ib_vl_weight_elem *vl)
+{
+       memcpy(vl, cache->table, VL_ARB_TABLE_SIZE * sizeof(*vl));
+}
+
+static void vl_arb_set_cache(struct vl_arb_cache *cache,
+                            struct ib_vl_weight_elem *vl)
+{
+       memcpy(cache->table, vl, VL_ARB_TABLE_SIZE * sizeof(*vl));
+}
+
+static int vl_arb_match_cache(struct vl_arb_cache *cache,
+                             struct ib_vl_weight_elem *vl)
+{
+       return !memcmp(cache->table, vl, VL_ARB_TABLE_SIZE * sizeof(*vl));
+}
+/* end functions related to vl arbitration table caching */
+
+static int set_vl_weights(struct hfi1_pportdata *ppd, u32 target,
+                         u32 size, struct ib_vl_weight_elem *vl)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+       u64 reg;
+       unsigned int i, is_up = 0;
+       int drain, ret = 0;
+
+       mutex_lock(&ppd->hls_lock);
+
+       if (ppd->host_link_state & HLS_UP)
+               is_up = 1;
+
+       drain = !is_ax(dd) && is_up;
+
+       if (drain)
+               /*
+                * Before adjusting VL arbitration weights, empty per-VL
+                * FIFOs, otherwise a packet whose VL weight is being
+                * set to 0 could get stuck in a FIFO with no chance to
+                * egress.
+                */
+               ret = stop_drain_data_vls(dd);
+
+       if (ret) {
+               dd_dev_err(
+                       dd,
+                       "%s: cannot stop/drain VLs - refusing to change VL arbitration weights\n",
+                       __func__);
+               goto err;
+       }
+
+       for (i = 0; i < size; i++, vl++) {
+               /*
+                * NOTE: The low priority shift and mask are used here, but
+                * they are the same for both the low and high registers.
+                */
+               reg = (((u64)vl->vl & SEND_LOW_PRIORITY_LIST_VL_MASK)
+                               << SEND_LOW_PRIORITY_LIST_VL_SHIFT)
+                     | (((u64)vl->weight
+                               & SEND_LOW_PRIORITY_LIST_WEIGHT_MASK)
+                               << SEND_LOW_PRIORITY_LIST_WEIGHT_SHIFT);
+               write_csr(dd, target + (i * 8), reg);
+       }
+       pio_send_control(dd, PSC_GLOBAL_VLARB_ENABLE);
+
+       if (drain)
+               open_fill_data_vls(dd); /* reopen all VLs */
+
+err:
+       mutex_unlock(&ppd->hls_lock);
+
+       return ret;
+}
+
+/*
+ * Read one credit merge VL register.
+ */
+static void read_one_cm_vl(struct hfi1_devdata *dd, u32 csr,
+                          struct vl_limit *vll)
+{
+       u64 reg = read_csr(dd, csr);
+
+       vll->dedicated = cpu_to_be16(
+               (reg >> SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_SHIFT)
+               & SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_MASK);
+       vll->shared = cpu_to_be16(
+               (reg >> SEND_CM_CREDIT_VL_SHARED_LIMIT_VL_SHIFT)
+               & SEND_CM_CREDIT_VL_SHARED_LIMIT_VL_MASK);
+}
+
+/*
+ * Read the current credit merge limits.
+ */
+static int get_buffer_control(struct hfi1_devdata *dd,
+                             struct buffer_control *bc, u16 *overall_limit)
+{
+       u64 reg;
+       int i;
+
+       /* not all entries are filled in */
+       memset(bc, 0, sizeof(*bc));
+
+       /* OPA and HFI have a 1-1 mapping */
+       for (i = 0; i < TXE_NUM_DATA_VL; i++)
+               read_one_cm_vl(dd, SEND_CM_CREDIT_VL + (8*i), &bc->vl[i]);
+
+       /* NOTE: assumes that VL* and VL15 CSRs are bit-wise identical */
+       read_one_cm_vl(dd, SEND_CM_CREDIT_VL15, &bc->vl[15]);
+
+       reg = read_csr(dd, SEND_CM_GLOBAL_CREDIT);
+       bc->overall_shared_limit = cpu_to_be16(
+               (reg >> SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_SHIFT)
+               & SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_MASK);
+       if (overall_limit)
+               *overall_limit = (reg
+                       >> SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_SHIFT)
+                       & SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_MASK;
+       return sizeof(struct buffer_control);
+}
+
+static int get_sc2vlnt(struct hfi1_devdata *dd, struct sc2vlnt *dp)
+{
+       u64 reg;
+       int i;
+
+       /* each register contains 16 SC->VLnt mappings, 4 bits each */
+       reg = read_csr(dd, DCC_CFG_SC_VL_TABLE_15_0);
+       for (i = 0; i < sizeof(u64); i++) {
+               u8 byte = *(((u8 *)&reg) + i);
+
+               dp->vlnt[2 * i] = byte & 0xf;
+               dp->vlnt[(2 * i) + 1] = (byte & 0xf0) >> 4;
+       }
+
+       reg = read_csr(dd, DCC_CFG_SC_VL_TABLE_31_16);
+       for (i = 0; i < sizeof(u64); i++) {
+               u8 byte = *(((u8 *)&reg) + i);
+
+               dp->vlnt[16 + (2 * i)] = byte & 0xf;
+               dp->vlnt[16 + (2 * i) + 1] = (byte & 0xf0) >> 4;
+       }
+       return sizeof(struct sc2vlnt);
+}
+
+static void get_vlarb_preempt(struct hfi1_devdata *dd, u32 nelems,
+                             struct ib_vl_weight_elem *vl)
+{
+       unsigned int i;
+
+       for (i = 0; i < nelems; i++, vl++) {
+               vl->vl = 0xf;
+               vl->weight = 0;
+       }
+}
+
+static void set_sc2vlnt(struct hfi1_devdata *dd, struct sc2vlnt *dp)
+{
+       write_csr(dd, DCC_CFG_SC_VL_TABLE_15_0,
+               DC_SC_VL_VAL(15_0,
+               0, dp->vlnt[0] & 0xf,
+               1, dp->vlnt[1] & 0xf,
+               2, dp->vlnt[2] & 0xf,
+               3, dp->vlnt[3] & 0xf,
+               4, dp->vlnt[4] & 0xf,
+               5, dp->vlnt[5] & 0xf,
+               6, dp->vlnt[6] & 0xf,
+               7, dp->vlnt[7] & 0xf,
+               8, dp->vlnt[8] & 0xf,
+               9, dp->vlnt[9] & 0xf,
+               10, dp->vlnt[10] & 0xf,
+               11, dp->vlnt[11] & 0xf,
+               12, dp->vlnt[12] & 0xf,
+               13, dp->vlnt[13] & 0xf,
+               14, dp->vlnt[14] & 0xf,
+               15, dp->vlnt[15] & 0xf));
+       write_csr(dd, DCC_CFG_SC_VL_TABLE_31_16,
+               DC_SC_VL_VAL(31_16,
+               16, dp->vlnt[16] & 0xf,
+               17, dp->vlnt[17] & 0xf,
+               18, dp->vlnt[18] & 0xf,
+               19, dp->vlnt[19] & 0xf,
+               20, dp->vlnt[20] & 0xf,
+               21, dp->vlnt[21] & 0xf,
+               22, dp->vlnt[22] & 0xf,
+               23, dp->vlnt[23] & 0xf,
+               24, dp->vlnt[24] & 0xf,
+               25, dp->vlnt[25] & 0xf,
+               26, dp->vlnt[26] & 0xf,
+               27, dp->vlnt[27] & 0xf,
+               28, dp->vlnt[28] & 0xf,
+               29, dp->vlnt[29] & 0xf,
+               30, dp->vlnt[30] & 0xf,
+               31, dp->vlnt[31] & 0xf));
+}
+
+static void nonzero_msg(struct hfi1_devdata *dd, int idx, const char *what,
+                       u16 limit)
+{
+       if (limit != 0)
+               dd_dev_info(dd, "Invalid %s limit %d on VL %d, ignoring\n",
+                       what, (int)limit, idx);
+}
+
+/* change only the shared limit portion of SendCmGLobalCredit */
+static void set_global_shared(struct hfi1_devdata *dd, u16 limit)
+{
+       u64 reg;
+
+       reg = read_csr(dd, SEND_CM_GLOBAL_CREDIT);
+       reg &= ~SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_SMASK;
+       reg |= (u64)limit << SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_SHIFT;
+       write_csr(dd, SEND_CM_GLOBAL_CREDIT, reg);
+}
+
+/* change only the total credit limit portion of SendCmGLobalCredit */
+static void set_global_limit(struct hfi1_devdata *dd, u16 limit)
+{
+       u64 reg;
+
+       reg = read_csr(dd, SEND_CM_GLOBAL_CREDIT);
+       reg &= ~SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_SMASK;
+       reg |= (u64)limit << SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_SHIFT;
+       write_csr(dd, SEND_CM_GLOBAL_CREDIT, reg);
+}
+
+/* set the given per-VL shared limit */
+static void set_vl_shared(struct hfi1_devdata *dd, int vl, u16 limit)
+{
+       u64 reg;
+       u32 addr;
+
+       if (vl < TXE_NUM_DATA_VL)
+               addr = SEND_CM_CREDIT_VL + (8 * vl);
+       else
+               addr = SEND_CM_CREDIT_VL15;
+
+       reg = read_csr(dd, addr);
+       reg &= ~SEND_CM_CREDIT_VL_SHARED_LIMIT_VL_SMASK;
+       reg |= (u64)limit << SEND_CM_CREDIT_VL_SHARED_LIMIT_VL_SHIFT;
+       write_csr(dd, addr, reg);
+}
+
+/* set the given per-VL dedicated limit */
+static void set_vl_dedicated(struct hfi1_devdata *dd, int vl, u16 limit)
+{
+       u64 reg;
+       u32 addr;
+
+       if (vl < TXE_NUM_DATA_VL)
+               addr = SEND_CM_CREDIT_VL + (8 * vl);
+       else
+               addr = SEND_CM_CREDIT_VL15;
+
+       reg = read_csr(dd, addr);
+       reg &= ~SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_SMASK;
+       reg |= (u64)limit << SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_SHIFT;
+       write_csr(dd, addr, reg);
+}
+
+/* spin until the given per-VL status mask bits clear */
+static void wait_for_vl_status_clear(struct hfi1_devdata *dd, u64 mask,
+                                    const char *which)
+{
+       unsigned long timeout;
+       u64 reg;
+
+       timeout = jiffies + msecs_to_jiffies(VL_STATUS_CLEAR_TIMEOUT);
+       while (1) {
+               reg = read_csr(dd, SEND_CM_CREDIT_USED_STATUS) & mask;
+
+               if (reg == 0)
+                       return; /* success */
+               if (time_after(jiffies, timeout))
+                       break;          /* timed out */
+               udelay(1);
+       }
+
+       dd_dev_err(dd,
+               "%s credit change status not clearing after %dms, mask 0x%llx, not clear 0x%llx\n",
+               which, VL_STATUS_CLEAR_TIMEOUT, mask, reg);
+       /*
+        * If this occurs, it is likely there was a credit loss on the link.
+        * The only recovery from that is a link bounce.
+        */
+       dd_dev_err(dd,
+               "Continuing anyway.  A credit loss may occur.  Suggest a link bounce\n");
+}
+
+/*
+ * The number of credits on the VLs may be changed while everything
+ * is "live", but the following algorithm must be followed due to
+ * how the hardware is actually implemented.  In particular,
+ * Return_Credit_Status[] is the only correct status check.
+ *
+ * if (reducing Global_Shared_Credit_Limit or any shared limit changing)
+ *     set Global_Shared_Credit_Limit = 0
+ *     use_all_vl = 1
+ * mask0 = all VLs that are changing either dedicated or shared limits
+ * set Shared_Limit[mask0] = 0
+ * spin until Return_Credit_Status[use_all_vl ? all VL : mask0] == 0
+ * if (changing any dedicated limit)
+ *     mask1 = all VLs that are lowering dedicated limits
+ *     lower Dedicated_Limit[mask1]
+ *     spin until Return_Credit_Status[mask1] == 0
+ *     raise Dedicated_Limits
+ * raise Shared_Limits
+ * raise Global_Shared_Credit_Limit
+ *
+ * lower = if the new limit is lower, set the limit to the new value
+ * raise = if the new limit is higher than the current value (may be changed
+ *     earlier in the algorithm), set the new limit to the new value
+ */
+static int set_buffer_control(struct hfi1_devdata *dd,
+                             struct buffer_control *new_bc)
+{
+       u64 changing_mask, ld_mask, stat_mask;
+       int change_count;
+       int i, use_all_mask;
+       int this_shared_changing;
+       /*
+        * A0: add the variable any_shared_limit_changing below and in the
+        * algorithm above.  If removing A0 support, it can be removed.
+        */
+       int any_shared_limit_changing;
+       struct buffer_control cur_bc;
+       u8 changing[OPA_MAX_VLS];
+       u8 lowering_dedicated[OPA_MAX_VLS];
+       u16 cur_total;
+       u32 new_total = 0;
+       const u64 all_mask =
+       SEND_CM_CREDIT_USED_STATUS_VL0_RETURN_CREDIT_STATUS_SMASK
+        | SEND_CM_CREDIT_USED_STATUS_VL1_RETURN_CREDIT_STATUS_SMASK
+        | SEND_CM_CREDIT_USED_STATUS_VL2_RETURN_CREDIT_STATUS_SMASK
+        | SEND_CM_CREDIT_USED_STATUS_VL3_RETURN_CREDIT_STATUS_SMASK
+        | SEND_CM_CREDIT_USED_STATUS_VL4_RETURN_CREDIT_STATUS_SMASK
+        | SEND_CM_CREDIT_USED_STATUS_VL5_RETURN_CREDIT_STATUS_SMASK
+        | SEND_CM_CREDIT_USED_STATUS_VL6_RETURN_CREDIT_STATUS_SMASK
+        | SEND_CM_CREDIT_USED_STATUS_VL7_RETURN_CREDIT_STATUS_SMASK
+        | SEND_CM_CREDIT_USED_STATUS_VL15_RETURN_CREDIT_STATUS_SMASK;
+
+#define valid_vl(idx) ((idx) < TXE_NUM_DATA_VL || (idx) == 15)
+#define NUM_USABLE_VLS 16      /* look at VL15 and less */
+
+
+       /* find the new total credits, do sanity check on unused VLs */
+       for (i = 0; i < OPA_MAX_VLS; i++) {
+               if (valid_vl(i)) {
+                       new_total += be16_to_cpu(new_bc->vl[i].dedicated);
+                       continue;
+               }
+               nonzero_msg(dd, i, "dedicated",
+                       be16_to_cpu(new_bc->vl[i].dedicated));
+               nonzero_msg(dd, i, "shared",
+                       be16_to_cpu(new_bc->vl[i].shared));
+               new_bc->vl[i].dedicated = 0;
+               new_bc->vl[i].shared = 0;
+       }
+       new_total += be16_to_cpu(new_bc->overall_shared_limit);
+       if (new_total > (u32)dd->link_credits)
+               return -EINVAL;
+       /* fetch the current values */
+       get_buffer_control(dd, &cur_bc, &cur_total);
+
+       /*
+        * Create the masks we will use.
+        */
+       memset(changing, 0, sizeof(changing));
+       memset(lowering_dedicated, 0, sizeof(lowering_dedicated));
+       /* NOTE: Assumes that the individual VL bits are adjacent and in
+          increasing order */
+       stat_mask =
+               SEND_CM_CREDIT_USED_STATUS_VL0_RETURN_CREDIT_STATUS_SMASK;
+       changing_mask = 0;
+       ld_mask = 0;
+       change_count = 0;
+       any_shared_limit_changing = 0;
+       for (i = 0; i < NUM_USABLE_VLS; i++, stat_mask <<= 1) {
+               if (!valid_vl(i))
+                       continue;
+               this_shared_changing = new_bc->vl[i].shared
+                                               != cur_bc.vl[i].shared;
+               if (this_shared_changing)
+                       any_shared_limit_changing = 1;
+               if (new_bc->vl[i].dedicated != cur_bc.vl[i].dedicated
+                               || this_shared_changing) {
+                       changing[i] = 1;
+                       changing_mask |= stat_mask;
+                       change_count++;
+               }
+               if (be16_to_cpu(new_bc->vl[i].dedicated) <
+                                       be16_to_cpu(cur_bc.vl[i].dedicated)) {
+                       lowering_dedicated[i] = 1;
+                       ld_mask |= stat_mask;
+               }
+       }
+
+       /* bracket the credit change with a total adjustment */
+       if (new_total > cur_total)
+               set_global_limit(dd, new_total);
+
+       /*
+        * Start the credit change algorithm.
+        */
+       use_all_mask = 0;
+       if ((be16_to_cpu(new_bc->overall_shared_limit) <
+                               be16_to_cpu(cur_bc.overall_shared_limit))
+                       || (is_a0(dd) && any_shared_limit_changing)) {
+               set_global_shared(dd, 0);
+               cur_bc.overall_shared_limit = 0;
+               use_all_mask = 1;
+       }
+
+       for (i = 0; i < NUM_USABLE_VLS; i++) {
+               if (!valid_vl(i))
+                       continue;
+
+               if (changing[i]) {
+                       set_vl_shared(dd, i, 0);
+                       cur_bc.vl[i].shared = 0;
+               }
+       }
+
+       wait_for_vl_status_clear(dd, use_all_mask ? all_mask : changing_mask,
+               "shared");
+
+       if (change_count > 0) {
+               for (i = 0; i < NUM_USABLE_VLS; i++) {
+                       if (!valid_vl(i))
+                               continue;
+
+                       if (lowering_dedicated[i]) {
+                               set_vl_dedicated(dd, i,
+                                       be16_to_cpu(new_bc->vl[i].dedicated));
+                               cur_bc.vl[i].dedicated =
+                                               new_bc->vl[i].dedicated;
+                       }
+               }
+
+               wait_for_vl_status_clear(dd, ld_mask, "dedicated");
+
+               /* now raise all dedicated that are going up */
+               for (i = 0; i < NUM_USABLE_VLS; i++) {
+                       if (!valid_vl(i))
+                               continue;
+
+                       if (be16_to_cpu(new_bc->vl[i].dedicated) >
+                                       be16_to_cpu(cur_bc.vl[i].dedicated))
+                               set_vl_dedicated(dd, i,
+                                       be16_to_cpu(new_bc->vl[i].dedicated));
+               }
+       }
+
+       /* next raise all shared that are going up */
+       for (i = 0; i < NUM_USABLE_VLS; i++) {
+               if (!valid_vl(i))
+                       continue;
+
+               if (be16_to_cpu(new_bc->vl[i].shared) >
+                               be16_to_cpu(cur_bc.vl[i].shared))
+                       set_vl_shared(dd, i, be16_to_cpu(new_bc->vl[i].shared));
+       }
+
+       /* finally raise the global shared */
+       if (be16_to_cpu(new_bc->overall_shared_limit) >
+                       be16_to_cpu(cur_bc.overall_shared_limit))
+               set_global_shared(dd,
+                       be16_to_cpu(new_bc->overall_shared_limit));
+
+       /* bracket the credit change with a total adjustment */
+       if (new_total < cur_total)
+               set_global_limit(dd, new_total);
+       return 0;
+}
+
+/*
+ * Read the given fabric manager table. Return the size of the
+ * table (in bytes) on success, and a negative error code on
+ * failure.
+ */
+int fm_get_table(struct hfi1_pportdata *ppd, int which, void *t)
+
+{
+       int size;
+       struct vl_arb_cache *vlc;
+
+       switch (which) {
+       case FM_TBL_VL_HIGH_ARB:
+               size = 256;
+               /*
+                * OPA specifies 128 elements (of 2 bytes each), though
+                * HFI supports only 16 elements in h/w.
+                */
+               vlc = vl_arb_lock_cache(ppd, HI_PRIO_TABLE);
+               vl_arb_get_cache(vlc, t);
+               vl_arb_unlock_cache(ppd, HI_PRIO_TABLE);
+               break;
+       case FM_TBL_VL_LOW_ARB:
+               size = 256;
+               /*
+                * OPA specifies 128 elements (of 2 bytes each), though
+                * HFI supports only 16 elements in h/w.
+                */
+               vlc = vl_arb_lock_cache(ppd, LO_PRIO_TABLE);
+               vl_arb_get_cache(vlc, t);
+               vl_arb_unlock_cache(ppd, LO_PRIO_TABLE);
+               break;
+       case FM_TBL_BUFFER_CONTROL:
+               size = get_buffer_control(ppd->dd, t, NULL);
+               break;
+       case FM_TBL_SC2VLNT:
+               size = get_sc2vlnt(ppd->dd, t);
+               break;
+       case FM_TBL_VL_PREEMPT_ELEMS:
+               size = 256;
+               /* OPA specifies 128 elements, of 2 bytes each */
+               get_vlarb_preempt(ppd->dd, OPA_MAX_VLS, t);
+               break;
+       case FM_TBL_VL_PREEMPT_MATRIX:
+               size = 256;
+               /*
+                * OPA specifies that this is the same size as the VL
+                * arbitration tables (i.e., 256 bytes).
+                */
+               break;
+       default:
+               return -EINVAL;
+       }
+       return size;
+}
+
+/*
+ * Write the given fabric manager table.
+ */
+int fm_set_table(struct hfi1_pportdata *ppd, int which, void *t)
+{
+       int ret = 0;
+       struct vl_arb_cache *vlc;
+
+       switch (which) {
+       case FM_TBL_VL_HIGH_ARB:
+               vlc = vl_arb_lock_cache(ppd, HI_PRIO_TABLE);
+               if (vl_arb_match_cache(vlc, t)) {
+                       vl_arb_unlock_cache(ppd, HI_PRIO_TABLE);
+                       break;
+               }
+               vl_arb_set_cache(vlc, t);
+               vl_arb_unlock_cache(ppd, HI_PRIO_TABLE);
+               ret = set_vl_weights(ppd, SEND_HIGH_PRIORITY_LIST,
+                                    VL_ARB_HIGH_PRIO_TABLE_SIZE, t);
+               break;
+       case FM_TBL_VL_LOW_ARB:
+               vlc = vl_arb_lock_cache(ppd, LO_PRIO_TABLE);
+               if (vl_arb_match_cache(vlc, t)) {
+                       vl_arb_unlock_cache(ppd, LO_PRIO_TABLE);
+                       break;
+               }
+               vl_arb_set_cache(vlc, t);
+               vl_arb_unlock_cache(ppd, LO_PRIO_TABLE);
+               ret = set_vl_weights(ppd, SEND_LOW_PRIORITY_LIST,
+                                    VL_ARB_LOW_PRIO_TABLE_SIZE, t);
+               break;
+       case FM_TBL_BUFFER_CONTROL:
+               ret = set_buffer_control(ppd->dd, t);
+               break;
+       case FM_TBL_SC2VLNT:
+               set_sc2vlnt(ppd->dd, t);
+               break;
+       default:
+               ret = -EINVAL;
+       }
+       return ret;
+}
+
+/*
+ * Disable all data VLs.
+ *
+ * Return 0 if disabled, non-zero if the VLs cannot be disabled.
+ */
+static int disable_data_vls(struct hfi1_devdata *dd)
+{
+       if (is_a0(dd))
+               return 1;
+
+       pio_send_control(dd, PSC_DATA_VL_DISABLE);
+
+       return 0;
+}
+
+/*
+ * open_fill_data_vls() - the counterpart to stop_drain_data_vls().
+ * Just re-enables all data VLs (the "fill" part happens
+ * automatically - the name was chosen for symmetry with
+ * stop_drain_data_vls()).
+ *
+ * Return 0 if successful, non-zero if the VLs cannot be enabled.
+ */
+int open_fill_data_vls(struct hfi1_devdata *dd)
+{
+       if (is_a0(dd))
+               return 1;
+
+       pio_send_control(dd, PSC_DATA_VL_ENABLE);
+
+       return 0;
+}
+
+/*
+ * drain_data_vls() - assumes that disable_data_vls() has been called,
+ * wait for occupancy (of per-VL FIFOs) for all contexts, and SDMA
+ * engines to drop to 0.
+ */
+static void drain_data_vls(struct hfi1_devdata *dd)
+{
+       sc_wait(dd);
+       sdma_wait(dd);
+       pause_for_credit_return(dd);
+}
+
+/*
+ * stop_drain_data_vls() - disable, then drain all per-VL fifos.
+ *
+ * Use open_fill_data_vls() to resume using data VLs.  This pair is
+ * meant to be used like this:
+ *
+ * stop_drain_data_vls(dd);
+ * // do things with per-VL resources
+ * open_fill_data_vls(dd);
+ */
+int stop_drain_data_vls(struct hfi1_devdata *dd)
+{
+       int ret;
+
+       ret = disable_data_vls(dd);
+       if (ret == 0)
+               drain_data_vls(dd);
+
+       return ret;
+}
+
+/*
+ * Convert a nanosecond time to a cclock count.  No matter how slow
+ * the cclock, a non-zero ns will always have a non-zero result.
+ */
+u32 ns_to_cclock(struct hfi1_devdata *dd, u32 ns)
+{
+       u32 cclocks;
+
+       if (dd->icode == ICODE_FPGA_EMULATION)
+               cclocks = (ns * 1000) / FPGA_CCLOCK_PS;
+       else  /* simulation pretends to be ASIC */
+               cclocks = (ns * 1000) / ASIC_CCLOCK_PS;
+       if (ns && !cclocks)     /* if ns nonzero, must be at least 1 */
+               cclocks = 1;
+       return cclocks;
+}
+
+/*
+ * Convert a cclock count to nanoseconds. Not matter how slow
+ * the cclock, a non-zero cclocks will always have a non-zero result.
+ */
+u32 cclock_to_ns(struct hfi1_devdata *dd, u32 cclocks)
+{
+       u32 ns;
+
+       if (dd->icode == ICODE_FPGA_EMULATION)
+               ns = (cclocks * FPGA_CCLOCK_PS) / 1000;
+       else  /* simulation pretends to be ASIC */
+               ns = (cclocks * ASIC_CCLOCK_PS) / 1000;
+       if (cclocks && !ns)
+               ns = 1;
+       return ns;
+}
+
+/*
+ * Dynamically adjust the receive interrupt timeout for a context based on
+ * incoming packet rate.
+ *
+ * NOTE: Dynamic adjustment does not allow rcv_intr_count to be zero.
+ */
+static void adjust_rcv_timeout(struct hfi1_ctxtdata *rcd, u32 npkts)
+{
+       struct hfi1_devdata *dd = rcd->dd;
+       u32 timeout = rcd->rcvavail_timeout;
+
+       /*
+        * This algorithm doubles or halves the timeout depending on whether
+        * the number of packets received in this interrupt were less than or
+        * greater equal the interrupt count.
+        *
+        * The calculations below do not allow a steady state to be achieved.
+        * Only at the endpoints it is possible to have an unchanging
+        * timeout.
+        */
+       if (npkts < rcv_intr_count) {
+               /*
+                * Not enough packets arrived before the timeout, adjust
+                * timeout downward.
+                */
+               if (timeout < 2) /* already at minimum? */
+                       return;
+               timeout >>= 1;
+       } else {
+               /*
+                * More than enough packets arrived before the timeout, adjust
+                * timeout upward.
+                */
+               if (timeout >= dd->rcv_intr_timeout_csr) /* already at max? */
+                       return;
+               timeout = min(timeout << 1, dd->rcv_intr_timeout_csr);
+       }
+
+       rcd->rcvavail_timeout = timeout;
+       /* timeout cannot be larger than rcv_intr_timeout_csr which has already
+          been verified to be in range */
+       write_kctxt_csr(dd, rcd->ctxt, RCV_AVAIL_TIME_OUT,
+               (u64)timeout << RCV_AVAIL_TIME_OUT_TIME_OUT_RELOAD_SHIFT);
+}
+
+void update_usrhead(struct hfi1_ctxtdata *rcd, u32 hd, u32 updegr, u32 egrhd,
+                   u32 intr_adjust, u32 npkts)
+{
+       struct hfi1_devdata *dd = rcd->dd;
+       u64 reg;
+       u32 ctxt = rcd->ctxt;
+
+       /*
+        * Need to write timeout register before updating RcvHdrHead to ensure
+        * that a new value is used when the HW decides to restart counting.
+        */
+       if (intr_adjust)
+               adjust_rcv_timeout(rcd, npkts);
+       if (updegr) {
+               reg = (egrhd & RCV_EGR_INDEX_HEAD_HEAD_MASK)
+                       << RCV_EGR_INDEX_HEAD_HEAD_SHIFT;
+               write_uctxt_csr(dd, ctxt, RCV_EGR_INDEX_HEAD, reg);
+       }
+       mmiowb();
+       reg = ((u64)rcv_intr_count << RCV_HDR_HEAD_COUNTER_SHIFT) |
+               (((u64)hd & RCV_HDR_HEAD_HEAD_MASK)
+                       << RCV_HDR_HEAD_HEAD_SHIFT);
+       write_uctxt_csr(dd, ctxt, RCV_HDR_HEAD, reg);
+       mmiowb();
+}
+
+u32 hdrqempty(struct hfi1_ctxtdata *rcd)
+{
+       u32 head, tail;
+
+       head = (read_uctxt_csr(rcd->dd, rcd->ctxt, RCV_HDR_HEAD)
+               & RCV_HDR_HEAD_HEAD_SMASK) >> RCV_HDR_HEAD_HEAD_SHIFT;
+
+       if (rcd->rcvhdrtail_kvaddr)
+               tail = get_rcvhdrtail(rcd);
+       else
+               tail = read_uctxt_csr(rcd->dd, rcd->ctxt, RCV_HDR_TAIL);
+
+       return head == tail;
+}
+
+/*
+ * Context Control and Receive Array encoding for buffer size:
+ *     0x0 invalid
+ *     0x1   4 KB
+ *     0x2   8 KB
+ *     0x3  16 KB
+ *     0x4  32 KB
+ *     0x5  64 KB
+ *     0x6 128 KB
+ *     0x7 256 KB
+ *     0x8 512 KB (Receive Array only)
+ *     0x9   1 MB (Receive Array only)
+ *     0xa   2 MB (Receive Array only)
+ *
+ *     0xB-0xF - reserved (Receive Array only)
+ *
+ *
+ * This routine assumes that the value has already been sanity checked.
+ */
+static u32 encoded_size(u32 size)
+{
+       switch (size) {
+       case   4*1024: return 0x1;
+       case   8*1024: return 0x2;
+       case  16*1024: return 0x3;
+       case  32*1024: return 0x4;
+       case  64*1024: return 0x5;
+       case 128*1024: return 0x6;
+       case 256*1024: return 0x7;
+       case 512*1024: return 0x8;
+       case   1*1024*1024: return 0x9;
+       case   2*1024*1024: return 0xa;
+       }
+       return 0x1;     /* if invalid, go with the minimum size */
+}
+
+void hfi1_rcvctrl(struct hfi1_devdata *dd, unsigned int op, int ctxt)
+{
+       struct hfi1_ctxtdata *rcd;
+       u64 rcvctrl, reg;
+       int did_enable = 0;
+
+       rcd = dd->rcd[ctxt];
+       if (!rcd)
+               return;
+
+       hfi1_cdbg(RCVCTRL, "ctxt %d op 0x%x", ctxt, op);
+
+       rcvctrl = read_kctxt_csr(dd, ctxt, RCV_CTXT_CTRL);
+       /* if the context already enabled, don't do the extra steps */
+       if ((op & HFI1_RCVCTRL_CTXT_ENB)
+                       && !(rcvctrl & RCV_CTXT_CTRL_ENABLE_SMASK)) {
+               /* reset the tail and hdr addresses, and sequence count */
+               write_kctxt_csr(dd, ctxt, RCV_HDR_ADDR,
+                               rcd->rcvhdrq_phys);
+               if (HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL))
+                       write_kctxt_csr(dd, ctxt, RCV_HDR_TAIL_ADDR,
+                                       rcd->rcvhdrqtailaddr_phys);
+               rcd->seq_cnt = 1;
+
+               /* reset the cached receive header queue head value */
+               rcd->head = 0;
+
+               /*
+                * Zero the receive header queue so we don't get false
+                * positives when checking the sequence number.  The
+                * sequence numbers could land exactly on the same spot.
+                * E.g. a rcd restart before the receive header wrapped.
+                */
+               memset(rcd->rcvhdrq, 0, rcd->rcvhdrq_size);
+
+               /* starting timeout */
+               rcd->rcvavail_timeout = dd->rcv_intr_timeout_csr;
+
+               /* enable the context */
+               rcvctrl |= RCV_CTXT_CTRL_ENABLE_SMASK;
+
+               /* clean the egr buffer size first */
+               rcvctrl &= ~RCV_CTXT_CTRL_EGR_BUF_SIZE_SMASK;
+               rcvctrl |= ((u64)encoded_size(rcd->egrbufs.rcvtid_size)
+                               & RCV_CTXT_CTRL_EGR_BUF_SIZE_MASK)
+                                       << RCV_CTXT_CTRL_EGR_BUF_SIZE_SHIFT;
+
+               /* zero RcvHdrHead - set RcvHdrHead.Counter after enable */
+               write_uctxt_csr(dd, ctxt, RCV_HDR_HEAD, 0);
+               did_enable = 1;
+
+               /* zero RcvEgrIndexHead */
+               write_uctxt_csr(dd, ctxt, RCV_EGR_INDEX_HEAD, 0);
+
+               /* set eager count and base index */
+               reg = (((u64)(rcd->egrbufs.alloced >> RCV_SHIFT)
+                       & RCV_EGR_CTRL_EGR_CNT_MASK)
+                      << RCV_EGR_CTRL_EGR_CNT_SHIFT) |
+                       (((rcd->eager_base >> RCV_SHIFT)
+                         & RCV_EGR_CTRL_EGR_BASE_INDEX_MASK)
+                        << RCV_EGR_CTRL_EGR_BASE_INDEX_SHIFT);
+               write_kctxt_csr(dd, ctxt, RCV_EGR_CTRL, reg);
+
+               /*
+                * Set TID (expected) count and base index.
+                * rcd->expected_count is set to individual RcvArray entries,
+                * not pairs, and the CSR takes a pair-count in groups of
+                * four, so divide by 8.
+                */
+               reg = (((rcd->expected_count >> RCV_SHIFT)
+                                       & RCV_TID_CTRL_TID_PAIR_CNT_MASK)
+                               << RCV_TID_CTRL_TID_PAIR_CNT_SHIFT) |
+                     (((rcd->expected_base >> RCV_SHIFT)
+                                       & RCV_TID_CTRL_TID_BASE_INDEX_MASK)
+                               << RCV_TID_CTRL_TID_BASE_INDEX_SHIFT);
+               write_kctxt_csr(dd, ctxt, RCV_TID_CTRL, reg);
+               if (ctxt == VL15CTXT)
+                       write_csr(dd, RCV_VL15, VL15CTXT);
+       }
+       if (op & HFI1_RCVCTRL_CTXT_DIS) {
+               write_csr(dd, RCV_VL15, 0);
+               rcvctrl &= ~RCV_CTXT_CTRL_ENABLE_SMASK;
+       }
+       if (op & HFI1_RCVCTRL_INTRAVAIL_ENB)
+               rcvctrl |= RCV_CTXT_CTRL_INTR_AVAIL_SMASK;
+       if (op & HFI1_RCVCTRL_INTRAVAIL_DIS)
+               rcvctrl &= ~RCV_CTXT_CTRL_INTR_AVAIL_SMASK;
+       if (op & HFI1_RCVCTRL_TAILUPD_ENB && rcd->rcvhdrqtailaddr_phys)
+               rcvctrl |= RCV_CTXT_CTRL_TAIL_UPD_SMASK;
+       if (op & HFI1_RCVCTRL_TAILUPD_DIS)
+               rcvctrl &= ~RCV_CTXT_CTRL_TAIL_UPD_SMASK;
+       if (op & HFI1_RCVCTRL_TIDFLOW_ENB)
+               rcvctrl |= RCV_CTXT_CTRL_TID_FLOW_ENABLE_SMASK;
+       if (op & HFI1_RCVCTRL_TIDFLOW_DIS)
+               rcvctrl &= ~RCV_CTXT_CTRL_TID_FLOW_ENABLE_SMASK;
+       if (op & HFI1_RCVCTRL_ONE_PKT_EGR_ENB) {
+               /* In one-packet-per-eager mode, the size comes from
+                  the RcvArray entry. */
+               rcvctrl &= ~RCV_CTXT_CTRL_EGR_BUF_SIZE_SMASK;
+               rcvctrl |= RCV_CTXT_CTRL_ONE_PACKET_PER_EGR_BUFFER_SMASK;
+       }
+       if (op & HFI1_RCVCTRL_ONE_PKT_EGR_DIS)
+               rcvctrl &= ~RCV_CTXT_CTRL_ONE_PACKET_PER_EGR_BUFFER_SMASK;
+       if (op & HFI1_RCVCTRL_NO_RHQ_DROP_ENB)
+               rcvctrl |= RCV_CTXT_CTRL_DONT_DROP_RHQ_FULL_SMASK;
+       if (op & HFI1_RCVCTRL_NO_RHQ_DROP_DIS)
+               rcvctrl &= ~RCV_CTXT_CTRL_DONT_DROP_RHQ_FULL_SMASK;
+       if (op & HFI1_RCVCTRL_NO_EGR_DROP_ENB)
+               rcvctrl |= RCV_CTXT_CTRL_DONT_DROP_EGR_FULL_SMASK;
+       if (op & HFI1_RCVCTRL_NO_EGR_DROP_DIS)
+               rcvctrl &= ~RCV_CTXT_CTRL_DONT_DROP_EGR_FULL_SMASK;
+       rcd->rcvctrl = rcvctrl;
+       hfi1_cdbg(RCVCTRL, "ctxt %d rcvctrl 0x%llx\n", ctxt, rcvctrl);
+       write_kctxt_csr(dd, ctxt, RCV_CTXT_CTRL, rcd->rcvctrl);
+
+       /* work around sticky RcvCtxtStatus.BlockedRHQFull */
+       if (did_enable
+           && (rcvctrl & RCV_CTXT_CTRL_DONT_DROP_RHQ_FULL_SMASK)) {
+               reg = read_kctxt_csr(dd, ctxt, RCV_CTXT_STATUS);
+               if (reg != 0) {
+                       dd_dev_info(dd, "ctxt %d status %lld (blocked)\n",
+                               ctxt, reg);
+                       read_uctxt_csr(dd, ctxt, RCV_HDR_HEAD);
+                       write_uctxt_csr(dd, ctxt, RCV_HDR_HEAD, 0x10);
+                       write_uctxt_csr(dd, ctxt, RCV_HDR_HEAD, 0x00);
+                       read_uctxt_csr(dd, ctxt, RCV_HDR_HEAD);
+                       reg = read_kctxt_csr(dd, ctxt, RCV_CTXT_STATUS);
+                       dd_dev_info(dd, "ctxt %d status %lld (%s blocked)\n",
+                               ctxt, reg, reg == 0 ? "not" : "still");
+               }
+       }
+
+       if (did_enable) {
+               /*
+                * The interrupt timeout and count must be set after
+                * the context is enabled to take effect.
+                */
+               /* set interrupt timeout */
+               write_kctxt_csr(dd, ctxt, RCV_AVAIL_TIME_OUT,
+                       (u64)rcd->rcvavail_timeout <<
+                               RCV_AVAIL_TIME_OUT_TIME_OUT_RELOAD_SHIFT);
+
+               /* set RcvHdrHead.Counter, zero RcvHdrHead.Head (again) */
+               reg = (u64)rcv_intr_count << RCV_HDR_HEAD_COUNTER_SHIFT;
+               write_uctxt_csr(dd, ctxt, RCV_HDR_HEAD, reg);
+       }
+
+       if (op & (HFI1_RCVCTRL_TAILUPD_DIS | HFI1_RCVCTRL_CTXT_DIS))
+               /*
+                * If the context has been disabled and the Tail Update has
+                * been cleared, clear the RCV_HDR_TAIL_ADDR CSR so
+                * it doesn't contain an address that is invalid.
+                */
+               write_kctxt_csr(dd, ctxt, RCV_HDR_TAIL_ADDR, 0);
+}
+
+u32 hfi1_read_cntrs(struct hfi1_devdata *dd, loff_t pos, char **namep,
+                   u64 **cntrp)
+{
+       int ret;
+       u64 val = 0;
+
+       if (namep) {
+               ret = dd->cntrnameslen;
+               if (pos != 0) {
+                       dd_dev_err(dd, "read_cntrs does not support indexing");
+                       return 0;
+               }
+               *namep = dd->cntrnames;
+       } else {
+               const struct cntr_entry *entry;
+               int i, j;
+
+               ret = (dd->ndevcntrs) * sizeof(u64);
+               if (pos != 0) {
+                       dd_dev_err(dd, "read_cntrs does not support indexing");
+                       return 0;
+               }
+
+               /* Get the start of the block of counters */
+               *cntrp = dd->cntrs;
+
+               /*
+                * Now go and fill in each counter in the block.
+                */
+               for (i = 0; i < DEV_CNTR_LAST; i++) {
+                       entry = &dev_cntrs[i];
+                       hfi1_cdbg(CNTR, "reading %s", entry->name);
+                       if (entry->flags & CNTR_DISABLED) {
+                               /* Nothing */
+                               hfi1_cdbg(CNTR, "\tDisabled\n");
+                       } else {
+                               if (entry->flags & CNTR_VL) {
+                                       hfi1_cdbg(CNTR, "\tPer VL\n");
+                                       for (j = 0; j < C_VL_COUNT; j++) {
+                                               val = entry->rw_cntr(entry,
+                                                                 dd, j,
+                                                                 CNTR_MODE_R,
+                                                                 0);
+                                               hfi1_cdbg(
+                                                  CNTR,
+                                                  "\t\tRead 0x%llx for %d\n",
+                                                  val, j);
+                                               dd->cntrs[entry->offset + j] =
+                                                                           val;
+                                       }
+                               } else {
+                                       val = entry->rw_cntr(entry, dd,
+                                                       CNTR_INVALID_VL,
+                                                       CNTR_MODE_R, 0);
+                                       dd->cntrs[entry->offset] = val;
+                                       hfi1_cdbg(CNTR, "\tRead 0x%llx", val);
+                               }
+                       }
+               }
+       }
+       return ret;
+}
+
+/*
+ * Used by sysfs to create files for hfi stats to read
+ */
+u32 hfi1_read_portcntrs(struct hfi1_devdata *dd, loff_t pos, u32 port,
+                       char **namep, u64 **cntrp)
+{
+       int ret;
+       u64 val = 0;
+
+       if (namep) {
+               ret = dd->portcntrnameslen;
+               if (pos != 0) {
+                       dd_dev_err(dd, "index not supported");
+                       return 0;
+               }
+               *namep = dd->portcntrnames;
+       } else {
+               const struct cntr_entry *entry;
+               struct hfi1_pportdata *ppd;
+               int i, j;
+
+               ret = (dd->nportcntrs) * sizeof(u64);
+               if (pos != 0) {
+                       dd_dev_err(dd, "indexing not supported");
+                       return 0;
+               }
+               ppd = (struct hfi1_pportdata *)(dd + 1 + port);
+               *cntrp = ppd->cntrs;
+
+               for (i = 0; i < PORT_CNTR_LAST; i++) {
+                       entry = &port_cntrs[i];
+                       hfi1_cdbg(CNTR, "reading %s", entry->name);
+                       if (entry->flags & CNTR_DISABLED) {
+                               /* Nothing */
+                               hfi1_cdbg(CNTR, "\tDisabled\n");
+                               continue;
+                       }
+
+                       if (entry->flags & CNTR_VL) {
+                               hfi1_cdbg(CNTR, "\tPer VL");
+                               for (j = 0; j < C_VL_COUNT; j++) {
+                                       val = entry->rw_cntr(entry, ppd, j,
+                                                              CNTR_MODE_R,
+                                                              0);
+                                       hfi1_cdbg(
+                                          CNTR,
+                                          "\t\tRead 0x%llx for %d",
+                                          val, j);
+                                       ppd->cntrs[entry->offset + j] = val;
+                               }
+                       } else {
+                               val = entry->rw_cntr(entry, ppd,
+                                                      CNTR_INVALID_VL,
+                                                      CNTR_MODE_R,
+                                                      0);
+                               ppd->cntrs[entry->offset] = val;
+                               hfi1_cdbg(CNTR, "\tRead 0x%llx", val);
+                       }
+               }
+       }
+       return ret;
+}
+
+static void free_cntrs(struct hfi1_devdata *dd)
+{
+       struct hfi1_pportdata *ppd;
+       int i;
+
+       if (dd->synth_stats_timer.data)
+               del_timer_sync(&dd->synth_stats_timer);
+       dd->synth_stats_timer.data = 0;
+       ppd = (struct hfi1_pportdata *)(dd + 1);
+       for (i = 0; i < dd->num_pports; i++, ppd++) {
+               kfree(ppd->cntrs);
+               kfree(ppd->scntrs);
+               free_percpu(ppd->ibport_data.rc_acks);
+               free_percpu(ppd->ibport_data.rc_qacks);
+               free_percpu(ppd->ibport_data.rc_delayed_comp);
+               ppd->cntrs = NULL;
+               ppd->scntrs = NULL;
+               ppd->ibport_data.rc_acks = NULL;
+               ppd->ibport_data.rc_qacks = NULL;
+               ppd->ibport_data.rc_delayed_comp = NULL;
+       }
+       kfree(dd->portcntrnames);
+       dd->portcntrnames = NULL;
+       kfree(dd->cntrs);
+       dd->cntrs = NULL;
+       kfree(dd->scntrs);
+       dd->scntrs = NULL;
+       kfree(dd->cntrnames);
+       dd->cntrnames = NULL;
+}
+
+#define CNTR_MAX 0xFFFFFFFFFFFFFFFFULL
+#define CNTR_32BIT_MAX 0x00000000FFFFFFFF
+
+static u64 read_dev_port_cntr(struct hfi1_devdata *dd, struct cntr_entry *entry,
+                             u64 *psval, void *context, int vl)
+{
+       u64 val;
+       u64 sval = *psval;
+
+       if (entry->flags & CNTR_DISABLED) {
+               dd_dev_err(dd, "Counter %s not enabled", entry->name);
+               return 0;
+       }
+
+       hfi1_cdbg(CNTR, "cntr: %s vl %d psval 0x%llx", entry->name, vl, *psval);
+
+       val = entry->rw_cntr(entry, context, vl, CNTR_MODE_R, 0);
+
+       /* If its a synthetic counter there is more work we need to do */
+       if (entry->flags & CNTR_SYNTH) {
+               if (sval == CNTR_MAX) {
+                       /* No need to read already saturated */
+                       return CNTR_MAX;
+               }
+
+               if (entry->flags & CNTR_32BIT) {
+                       /* 32bit counters can wrap multiple times */
+                       u64 upper = sval >> 32;
+                       u64 lower = (sval << 32) >> 32;
+
+                       if (lower > val) { /* hw wrapped */
+                               if (upper == CNTR_32BIT_MAX)
+                                       val = CNTR_MAX;
+                               else
+                                       upper++;
+                       }
+
+                       if (val != CNTR_MAX)
+                               val = (upper << 32) | val;
+
+               } else {
+                       /* If we rolled we are saturated */
+                       if ((val < sval) || (val > CNTR_MAX))
+                               val = CNTR_MAX;
+               }
+       }
+
+       *psval = val;
+
+       hfi1_cdbg(CNTR, "\tNew val=0x%llx", val);
+
+       return val;
+}
+
+static u64 write_dev_port_cntr(struct hfi1_devdata *dd,
+                              struct cntr_entry *entry,
+                              u64 *psval, void *context, int vl, u64 data)
+{
+       u64 val;
+
+       if (entry->flags & CNTR_DISABLED) {
+               dd_dev_err(dd, "Counter %s not enabled", entry->name);
+               return 0;
+       }
+
+       hfi1_cdbg(CNTR, "cntr: %s vl %d psval 0x%llx", entry->name, vl, *psval);
+
+       if (entry->flags & CNTR_SYNTH) {
+               *psval = data;
+               if (entry->flags & CNTR_32BIT) {
+                       val = entry->rw_cntr(entry, context, vl, CNTR_MODE_W,
+                                            (data << 32) >> 32);
+                       val = data; /* return the full 64bit value */
+               } else {
+                       val = entry->rw_cntr(entry, context, vl, CNTR_MODE_W,
+                                            data);
+               }
+       } else {
+               val = entry->rw_cntr(entry, context, vl, CNTR_MODE_W, data);
+       }
+
+       *psval = val;
+
+       hfi1_cdbg(CNTR, "\tNew val=0x%llx", val);
+
+       return val;
+}
+
+u64 read_dev_cntr(struct hfi1_devdata *dd, int index, int vl)
+{
+       struct cntr_entry *entry;
+       u64 *sval;
+
+       entry = &dev_cntrs[index];
+       sval = dd->scntrs + entry->offset;
+
+       if (vl != CNTR_INVALID_VL)
+               sval += vl;
+
+       return read_dev_port_cntr(dd, entry, sval, dd, vl);
+}
+
+u64 write_dev_cntr(struct hfi1_devdata *dd, int index, int vl, u64 data)
+{
+       struct cntr_entry *entry;
+       u64 *sval;
+
+       entry = &dev_cntrs[index];
+       sval = dd->scntrs + entry->offset;
+
+       if (vl != CNTR_INVALID_VL)
+               sval += vl;
+
+       return write_dev_port_cntr(dd, entry, sval, dd, vl, data);
+}
+
+u64 read_port_cntr(struct hfi1_pportdata *ppd, int index, int vl)
+{
+       struct cntr_entry *entry;
+       u64 *sval;
+
+       entry = &port_cntrs[index];
+       sval = ppd->scntrs + entry->offset;
+
+       if (vl != CNTR_INVALID_VL)
+               sval += vl;
+
+       if ((index >= C_RCV_HDR_OVF_FIRST + ppd->dd->num_rcv_contexts) &&
+           (index <= C_RCV_HDR_OVF_LAST)) {
+               /* We do not want to bother for disabled contexts */
+               return 0;
+       }
+
+       return read_dev_port_cntr(ppd->dd, entry, sval, ppd, vl);
+}
+
+u64 write_port_cntr(struct hfi1_pportdata *ppd, int index, int vl, u64 data)
+{
+       struct cntr_entry *entry;
+       u64 *sval;
+
+       entry = &port_cntrs[index];
+       sval = ppd->scntrs + entry->offset;
+
+       if (vl != CNTR_INVALID_VL)
+               sval += vl;
+
+       if ((index >= C_RCV_HDR_OVF_FIRST + ppd->dd->num_rcv_contexts) &&
+           (index <= C_RCV_HDR_OVF_LAST)) {
+               /* We do not want to bother for disabled contexts */
+               return 0;
+       }
+
+       return write_dev_port_cntr(ppd->dd, entry, sval, ppd, vl, data);
+}
+
+static void update_synth_timer(unsigned long opaque)
+{
+       u64 cur_tx;
+       u64 cur_rx;
+       u64 total_flits;
+       u8 update = 0;
+       int i, j, vl;
+       struct hfi1_pportdata *ppd;
+       struct cntr_entry *entry;
+
+       struct hfi1_devdata *dd = (struct hfi1_devdata *)opaque;
+
+       /*
+        * Rather than keep beating on the CSRs pick a minimal set that we can
+        * check to watch for potential roll over. We can do this by looking at
+        * the number of flits sent/recv. If the total flits exceeds 32bits then
+        * we have to iterate all the counters and update.
+        */
+       entry = &dev_cntrs[C_DC_RCV_FLITS];
+       cur_rx = entry->rw_cntr(entry, dd, CNTR_INVALID_VL, CNTR_MODE_R, 0);
+
+       entry = &dev_cntrs[C_DC_XMIT_FLITS];
+       cur_tx = entry->rw_cntr(entry, dd, CNTR_INVALID_VL, CNTR_MODE_R, 0);
+
+       hfi1_cdbg(
+           CNTR,
+           "[%d] curr tx=0x%llx rx=0x%llx :: last tx=0x%llx rx=0x%llx\n",
+           dd->unit, cur_tx, cur_rx, dd->last_tx, dd->last_rx);
+
+       if ((cur_tx < dd->last_tx) || (cur_rx < dd->last_rx)) {
+               /*
+                * May not be strictly necessary to update but it won't hurt and
+                * simplifies the logic here.
+                */
+               update = 1;
+               hfi1_cdbg(CNTR, "[%d] Tripwire counter rolled, updating",
+                         dd->unit);
+       } else {
+               total_flits = (cur_tx - dd->last_tx) + (cur_rx - dd->last_rx);
+               hfi1_cdbg(CNTR,
+                         "[%d] total flits 0x%llx limit 0x%llx\n", dd->unit,
+                         total_flits, (u64)CNTR_32BIT_MAX);
+               if (total_flits >= CNTR_32BIT_MAX) {
+                       hfi1_cdbg(CNTR, "[%d] 32bit limit hit, updating",
+                                 dd->unit);
+                       update = 1;
+               }
+       }
+
+       if (update) {
+               hfi1_cdbg(CNTR, "[%d] Updating dd and ppd counters", dd->unit);
+               for (i = 0; i < DEV_CNTR_LAST; i++) {
+                       entry = &dev_cntrs[i];
+                       if (entry->flags & CNTR_VL) {
+                               for (vl = 0; vl < C_VL_COUNT; vl++)
+                                       read_dev_cntr(dd, i, vl);
+                       } else {
+                               read_dev_cntr(dd, i, CNTR_INVALID_VL);
+                       }
+               }
+               ppd = (struct hfi1_pportdata *)(dd + 1);
+               for (i = 0; i < dd->num_pports; i++, ppd++) {
+                       for (j = 0; j < PORT_CNTR_LAST; j++) {
+                               entry = &port_cntrs[j];
+                               if (entry->flags & CNTR_VL) {
+                                       for (vl = 0; vl < C_VL_COUNT; vl++)
+                                               read_port_cntr(ppd, j, vl);
+                               } else {
+                                       read_port_cntr(ppd, j, CNTR_INVALID_VL);
+                               }
+                       }
+               }
+
+               /*
+                * We want the value in the register. The goal is to keep track
+                * of the number of "ticks" not the counter value. In other
+                * words if the register rolls we want to notice it and go ahead
+                * and force an update.
+                */
+               entry = &dev_cntrs[C_DC_XMIT_FLITS];
+               dd->last_tx = entry->rw_cntr(entry, dd, CNTR_INVALID_VL,
+                                               CNTR_MODE_R, 0);
+
+               entry = &dev_cntrs[C_DC_RCV_FLITS];
+               dd->last_rx = entry->rw_cntr(entry, dd, CNTR_INVALID_VL,
+                                               CNTR_MODE_R, 0);
+
+               hfi1_cdbg(CNTR, "[%d] setting last tx/rx to 0x%llx 0x%llx",
+                         dd->unit, dd->last_tx, dd->last_rx);
+
+       } else {
+               hfi1_cdbg(CNTR, "[%d] No update necessary", dd->unit);
+       }
+
+mod_timer(&dd->synth_stats_timer, jiffies + HZ * SYNTH_CNT_TIME);
+}
+
+#define C_MAX_NAME 13 /* 12 chars + one for /0 */
+static int init_cntrs(struct hfi1_devdata *dd)
+{
+       int i, rcv_ctxts, index, j;
+       size_t sz;
+       char *p;
+       char name[C_MAX_NAME];
+       struct hfi1_pportdata *ppd;
+
+       /* set up the stats timer; the add_timer is done at the end */
+       setup_timer(&dd->synth_stats_timer, update_synth_timer,
+                   (unsigned long)dd);
+
+       /***********************/
+       /* per device counters */
+       /***********************/
+
+       /* size names and determine how many we have*/
+       dd->ndevcntrs = 0;
+       sz = 0;
+       index = 0;
+
+       for (i = 0; i < DEV_CNTR_LAST; i++) {
+               hfi1_dbg_early("Init cntr %s\n", dev_cntrs[i].name);
+               if (dev_cntrs[i].flags & CNTR_DISABLED) {
+                       hfi1_dbg_early("\tSkipping %s\n", dev_cntrs[i].name);
+                       continue;
+               }
+
+               if (dev_cntrs[i].flags & CNTR_VL) {
+                       hfi1_dbg_early("\tProcessing VL cntr\n");
+                       dev_cntrs[i].offset = index;
+                       for (j = 0; j < C_VL_COUNT; j++) {
+                               memset(name, '\0', C_MAX_NAME);
+                               snprintf(name, C_MAX_NAME, "%s%d",
+                                       dev_cntrs[i].name,
+                                       vl_from_idx(j));
+                               sz += strlen(name);
+                               sz++;
+                               hfi1_dbg_early("\t\t%s\n", name);
+                               dd->ndevcntrs++;
+                               index++;
+                       }
+               } else {
+                       /* +1 for newline  */
+                       sz += strlen(dev_cntrs[i].name) + 1;
+                       dd->ndevcntrs++;
+                       dev_cntrs[i].offset = index;
+                       index++;
+                       hfi1_dbg_early("\tAdding %s\n", dev_cntrs[i].name);
+               }
+       }
+
+       /* allocate space for the counter values */
+       dd->cntrs = kcalloc(index, sizeof(u64), GFP_KERNEL);
+       if (!dd->cntrs)
+               goto bail;
+
+       dd->scntrs = kcalloc(index, sizeof(u64), GFP_KERNEL);
+       if (!dd->scntrs)
+               goto bail;
+
+
+       /* allocate space for the counter names */
+       dd->cntrnameslen = sz;
+       dd->cntrnames = kmalloc(sz, GFP_KERNEL);
+       if (!dd->cntrnames)
+               goto bail;
+
+       /* fill in the names */
+       for (p = dd->cntrnames, i = 0, index = 0; i < DEV_CNTR_LAST; i++) {
+               if (dev_cntrs[i].flags & CNTR_DISABLED) {
+                       /* Nothing */
+               } else {
+                       if (dev_cntrs[i].flags & CNTR_VL) {
+                               for (j = 0; j < C_VL_COUNT; j++) {
+                                       memset(name, '\0', C_MAX_NAME);
+                                       snprintf(name, C_MAX_NAME, "%s%d",
+                                               dev_cntrs[i].name,
+                                               vl_from_idx(j));
+                                       memcpy(p, name, strlen(name));
+                                       p += strlen(name);
+                                       *p++ = '\n';
+                               }
+                       } else {
+                               memcpy(p, dev_cntrs[i].name,
+                                      strlen(dev_cntrs[i].name));
+                               p += strlen(dev_cntrs[i].name);
+                               *p++ = '\n';
+                       }
+                       index++;
+               }
+       }
+
+       /*********************/
+       /* per port counters */
+       /*********************/
+
+       /*
+        * Go through the counters for the overflows and disable the ones we
+        * don't need. This varies based on platform so we need to do it
+        * dynamically here.
+        */
+       rcv_ctxts = dd->num_rcv_contexts;
+       for (i = C_RCV_HDR_OVF_FIRST + rcv_ctxts;
+            i <= C_RCV_HDR_OVF_LAST; i++) {
+               port_cntrs[i].flags |= CNTR_DISABLED;
+       }
+
+       /* size port counter names and determine how many we have*/
+       sz = 0;
+       dd->nportcntrs = 0;
+       for (i = 0; i < PORT_CNTR_LAST; i++) {
+               hfi1_dbg_early("Init pcntr %s\n", port_cntrs[i].name);
+               if (port_cntrs[i].flags & CNTR_DISABLED) {
+                       hfi1_dbg_early("\tSkipping %s\n", port_cntrs[i].name);
+                       continue;
+               }
+
+               if (port_cntrs[i].flags & CNTR_VL) {
+                       hfi1_dbg_early("\tProcessing VL cntr\n");
+                       port_cntrs[i].offset = dd->nportcntrs;
+                       for (j = 0; j < C_VL_COUNT; j++) {
+                               memset(name, '\0', C_MAX_NAME);
+                               snprintf(name, C_MAX_NAME, "%s%d",
+                                       port_cntrs[i].name,
+                                       vl_from_idx(j));
+                               sz += strlen(name);
+                               sz++;
+                               hfi1_dbg_early("\t\t%s\n", name);
+                               dd->nportcntrs++;
+                       }
+               } else {
+                       /* +1 for newline  */
+                       sz += strlen(port_cntrs[i].name) + 1;
+                       port_cntrs[i].offset = dd->nportcntrs;
+                       dd->nportcntrs++;
+                       hfi1_dbg_early("\tAdding %s\n", port_cntrs[i].name);
+               }
+       }
+
+       /* allocate space for the counter names */
+       dd->portcntrnameslen = sz;
+       dd->portcntrnames = kmalloc(sz, GFP_KERNEL);
+       if (!dd->portcntrnames)
+               goto bail;
+
+       /* fill in port cntr names */
+       for (p = dd->portcntrnames, i = 0; i < PORT_CNTR_LAST; i++) {
+               if (port_cntrs[i].flags & CNTR_DISABLED)
+                       continue;
+
+               if (port_cntrs[i].flags & CNTR_VL) {
+                       for (j = 0; j < C_VL_COUNT; j++) {
+                               memset(name, '\0', C_MAX_NAME);
+                               snprintf(name, C_MAX_NAME, "%s%d",
+                                       port_cntrs[i].name,
+                                       vl_from_idx(j));
+                               memcpy(p, name, strlen(name));
+                               p += strlen(name);
+                               *p++ = '\n';
+                       }
+               } else {
+                       memcpy(p, port_cntrs[i].name,
+                              strlen(port_cntrs[i].name));
+                       p += strlen(port_cntrs[i].name);
+                       *p++ = '\n';
+               }
+       }
+
+       /* allocate per port storage for counter values */
+       ppd = (struct hfi1_pportdata *)(dd + 1);
+       for (i = 0; i < dd->num_pports; i++, ppd++) {
+               ppd->cntrs = kcalloc(dd->nportcntrs, sizeof(u64), GFP_KERNEL);
+               if (!ppd->cntrs)
+                       goto bail;
+
+               ppd->scntrs = kcalloc(dd->nportcntrs, sizeof(u64), GFP_KERNEL);
+               if (!ppd->scntrs)
+                       goto bail;
+       }
+
+       /* CPU counters need to be allocated and zeroed */
+       if (init_cpu_counters(dd))
+               goto bail;
+
+       mod_timer(&dd->synth_stats_timer, jiffies + HZ * SYNTH_CNT_TIME);
+       return 0;
+bail:
+       free_cntrs(dd);
+       return -ENOMEM;
+}
+
+
+static u32 chip_to_opa_lstate(struct hfi1_devdata *dd, u32 chip_lstate)
+{
+       switch (chip_lstate) {
+       default:
+               dd_dev_err(dd,
+                        "Unknown logical state 0x%x, reporting IB_PORT_DOWN\n",
+                        chip_lstate);
+               /* fall through */
+       case LSTATE_DOWN:
+               return IB_PORT_DOWN;
+       case LSTATE_INIT:
+               return IB_PORT_INIT;
+       case LSTATE_ARMED:
+               return IB_PORT_ARMED;
+       case LSTATE_ACTIVE:
+               return IB_PORT_ACTIVE;
+       }
+}
+
+u32 chip_to_opa_pstate(struct hfi1_devdata *dd, u32 chip_pstate)
+{
+       /* look at the HFI meta-states only */
+       switch (chip_pstate & 0xf0) {
+       default:
+               dd_dev_err(dd, "Unexpected chip physical state of 0x%x\n",
+                       chip_pstate);
+               /* fall through */
+       case PLS_DISABLED:
+               return IB_PORTPHYSSTATE_DISABLED;
+       case PLS_OFFLINE:
+               return OPA_PORTPHYSSTATE_OFFLINE;
+       case PLS_POLLING:
+               return IB_PORTPHYSSTATE_POLLING;
+       case PLS_CONFIGPHY:
+               return IB_PORTPHYSSTATE_TRAINING;
+       case PLS_LINKUP:
+               return IB_PORTPHYSSTATE_LINKUP;
+       case PLS_PHYTEST:
+               return IB_PORTPHYSSTATE_PHY_TEST;
+       }
+}
+
+/* return the OPA port logical state name */
+const char *opa_lstate_name(u32 lstate)
+{
+       static const char * const port_logical_names[] = {
+               "PORT_NOP",
+               "PORT_DOWN",
+               "PORT_INIT",
+               "PORT_ARMED",
+               "PORT_ACTIVE",
+               "PORT_ACTIVE_DEFER",
+       };
+       if (lstate < ARRAY_SIZE(port_logical_names))
+               return port_logical_names[lstate];
+       return "unknown";
+}
+
+/* return the OPA port physical state name */
+const char *opa_pstate_name(u32 pstate)
+{
+       static const char * const port_physical_names[] = {
+               "PHYS_NOP",
+               "reserved1",
+               "PHYS_POLL",
+               "PHYS_DISABLED",
+               "PHYS_TRAINING",
+               "PHYS_LINKUP",
+               "PHYS_LINK_ERR_RECOVER",
+               "PHYS_PHY_TEST",
+               "reserved8",
+               "PHYS_OFFLINE",
+               "PHYS_GANGED",
+               "PHYS_TEST",
+       };
+       if (pstate < ARRAY_SIZE(port_physical_names))
+               return port_physical_names[pstate];
+       return "unknown";
+}
+
+/*
+ * Read the hardware link state and set the driver's cached value of it.
+ * Return the (new) current value.
+ */
+u32 get_logical_state(struct hfi1_pportdata *ppd)
+{
+       u32 new_state;
+
+       new_state = chip_to_opa_lstate(ppd->dd, read_logical_state(ppd->dd));
+       if (new_state != ppd->lstate) {
+               dd_dev_info(ppd->dd, "logical state changed to %s (0x%x)\n",
+                       opa_lstate_name(new_state), new_state);
+               ppd->lstate = new_state;
+       }
+       /*
+        * Set port status flags in the page mapped into userspace
+        * memory. Do it here to ensure a reliable state - this is
+        * the only function called by all state handling code.
+        * Always set the flags due to the fact that the cache value
+        * might have been changed explicitly outside of this
+        * function.
+        */
+       if (ppd->statusp) {
+               switch (ppd->lstate) {
+               case IB_PORT_DOWN:
+               case IB_PORT_INIT:
+                       *ppd->statusp &= ~(HFI1_STATUS_IB_CONF |
+                                          HFI1_STATUS_IB_READY);
+                       break;
+               case IB_PORT_ARMED:
+                       *ppd->statusp |= HFI1_STATUS_IB_CONF;
+                       break;
+               case IB_PORT_ACTIVE:
+                       *ppd->statusp |= HFI1_STATUS_IB_READY;
+                       break;
+               }
+       }
+       return ppd->lstate;
+}
+
+/**
+ * wait_logical_linkstate - wait for an IB link state change to occur
+ * @ppd: port device
+ * @state: the state to wait for
+ * @msecs: the number of milliseconds to wait
+ *
+ * Wait up to msecs milliseconds for IB link state change to occur.
+ * For now, take the easy polling route.
+ * Returns 0 if state reached, otherwise -ETIMEDOUT.
+ */
+static int wait_logical_linkstate(struct hfi1_pportdata *ppd, u32 state,
+                                 int msecs)
+{
+       unsigned long timeout;
+
+       timeout = jiffies + msecs_to_jiffies(msecs);
+       while (1) {
+               if (get_logical_state(ppd) == state)
+                       return 0;
+               if (time_after(jiffies, timeout))
+                       break;
+               msleep(20);
+       }
+       dd_dev_err(ppd->dd, "timeout waiting for link state 0x%x\n", state);
+
+       return -ETIMEDOUT;
+}
+
+u8 hfi1_ibphys_portstate(struct hfi1_pportdata *ppd)
+{
+       static u32 remembered_state = 0xff;
+       u32 pstate;
+       u32 ib_pstate;
+
+       pstate = read_physical_state(ppd->dd);
+       ib_pstate = chip_to_opa_pstate(ppd->dd, pstate);
+       if (remembered_state != ib_pstate) {
+               dd_dev_info(ppd->dd,
+                       "%s: physical state changed to %s (0x%x), phy 0x%x\n",
+                       __func__, opa_pstate_name(ib_pstate), ib_pstate,
+                       pstate);
+               remembered_state = ib_pstate;
+       }
+       return ib_pstate;
+}
+
+/*
+ * Read/modify/write ASIC_QSFP register bits as selected by mask
+ * data: 0 or 1 in the positions depending on what needs to be written
+ * dir: 0 for read, 1 for write
+ * mask: select by setting
+ *      I2CCLK  (bit 0)
+ *      I2CDATA (bit 1)
+ */
+u64 hfi1_gpio_mod(struct hfi1_devdata *dd, u32 target, u32 data, u32 dir,
+                 u32 mask)
+{
+       u64 qsfp_oe, target_oe;
+
+       target_oe = target ? ASIC_QSFP2_OE : ASIC_QSFP1_OE;
+       if (mask) {
+               /* We are writing register bits, so lock access */
+               dir &= mask;
+               data &= mask;
+
+               qsfp_oe = read_csr(dd, target_oe);
+               qsfp_oe = (qsfp_oe & ~(u64)mask) | (u64)dir;
+               write_csr(dd, target_oe, qsfp_oe);
+       }
+       /* We are exclusively reading bits here, but it is unlikely
+        * we'll get valid data when we set the direction of the pin
+        * in the same call, so read should call this function again
+        * to get valid data
+        */
+       return read_csr(dd, target ? ASIC_QSFP2_IN : ASIC_QSFP1_IN);
+}
+
+#define CLEAR_STATIC_RATE_CONTROL_SMASK(r) \
+(r &= ~SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK)
+
+#define SET_STATIC_RATE_CONTROL_SMASK(r) \
+(r |= SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK)
+
+int hfi1_init_ctxt(struct send_context *sc)
+{
+       if (sc != NULL) {
+               struct hfi1_devdata *dd = sc->dd;
+               u64 reg;
+               u8 set = (sc->type == SC_USER ?
+                         HFI1_CAP_IS_USET(STATIC_RATE_CTRL) :
+                         HFI1_CAP_IS_KSET(STATIC_RATE_CTRL));
+               reg = read_kctxt_csr(dd, sc->hw_context,
+                                    SEND_CTXT_CHECK_ENABLE);
+               if (set)
+                       CLEAR_STATIC_RATE_CONTROL_SMASK(reg);
+               else
+                       SET_STATIC_RATE_CONTROL_SMASK(reg);
+               write_kctxt_csr(dd, sc->hw_context,
+                               SEND_CTXT_CHECK_ENABLE, reg);
+       }
+       return 0;
+}
+
+int hfi1_tempsense_rd(struct hfi1_devdata *dd, struct hfi1_temp *temp)
+{
+       int ret = 0;
+       u64 reg;
+
+       if (dd->icode != ICODE_RTL_SILICON) {
+               if (HFI1_CAP_IS_KSET(PRINT_UNIMPL))
+                       dd_dev_info(dd, "%s: tempsense not supported by HW\n",
+                                   __func__);
+               return -EINVAL;
+       }
+       reg = read_csr(dd, ASIC_STS_THERM);
+       temp->curr = ((reg >> ASIC_STS_THERM_CURR_TEMP_SHIFT) &
+                     ASIC_STS_THERM_CURR_TEMP_MASK);
+       temp->lo_lim = ((reg >> ASIC_STS_THERM_LO_TEMP_SHIFT) &
+                       ASIC_STS_THERM_LO_TEMP_MASK);
+       temp->hi_lim = ((reg >> ASIC_STS_THERM_HI_TEMP_SHIFT) &
+                       ASIC_STS_THERM_HI_TEMP_MASK);
+       temp->crit_lim = ((reg >> ASIC_STS_THERM_CRIT_TEMP_SHIFT) &
+                         ASIC_STS_THERM_CRIT_TEMP_MASK);
+       /* triggers is a 3-bit value - 1 bit per trigger. */
+       temp->triggers = (u8)((reg >> ASIC_STS_THERM_LOW_SHIFT) & 0x7);
+
+       return ret;
+}
+
+/* ========================================================================= */
+
+/*
+ * Enable/disable chip from delivering interrupts.
+ */
+void set_intr_state(struct hfi1_devdata *dd, u32 enable)
+{
+       int i;
+
+       /*
+        * In HFI, the mask needs to be 1 to allow interrupts.
+        */
+       if (enable) {
+               u64 cce_int_mask;
+               const int qsfp1_int_smask = QSFP1_INT % 64;
+               const int qsfp2_int_smask = QSFP2_INT % 64;
+
+               /* enable all interrupts */
+               for (i = 0; i < CCE_NUM_INT_CSRS; i++)
+                       write_csr(dd, CCE_INT_MASK + (8*i), ~(u64)0);
+
+               /*
+                * disable QSFP1 interrupts for HFI1, QSFP2 interrupts for HFI0
+                * Qsfp1Int and Qsfp2Int are adjacent bits in the same CSR,
+                * therefore just one of QSFP1_INT/QSFP2_INT can be used to find
+                * the index of the appropriate CSR in the CCEIntMask CSR array
+                */
+               cce_int_mask = read_csr(dd, CCE_INT_MASK +
+                                               (8*(QSFP1_INT/64)));
+               if (dd->hfi1_id) {
+                       cce_int_mask &= ~((u64)1 << qsfp1_int_smask);
+                       write_csr(dd, CCE_INT_MASK + (8*(QSFP1_INT/64)),
+                                       cce_int_mask);
+               } else {
+                       cce_int_mask &= ~((u64)1 << qsfp2_int_smask);
+                       write_csr(dd, CCE_INT_MASK + (8*(QSFP2_INT/64)),
+                                       cce_int_mask);
+               }
+       } else {
+               for (i = 0; i < CCE_NUM_INT_CSRS; i++)
+                       write_csr(dd, CCE_INT_MASK + (8*i), 0ull);
+       }
+}
+
+/*
+ * Clear all interrupt sources on the chip.
+ */
+static void clear_all_interrupts(struct hfi1_devdata *dd)
+{
+       int i;
+
+       for (i = 0; i < CCE_NUM_INT_CSRS; i++)
+               write_csr(dd, CCE_INT_CLEAR + (8*i), ~(u64)0);
+
+       write_csr(dd, CCE_ERR_CLEAR, ~(u64)0);
+       write_csr(dd, MISC_ERR_CLEAR, ~(u64)0);
+       write_csr(dd, RCV_ERR_CLEAR, ~(u64)0);
+       write_csr(dd, SEND_ERR_CLEAR, ~(u64)0);
+       write_csr(dd, SEND_PIO_ERR_CLEAR, ~(u64)0);
+       write_csr(dd, SEND_DMA_ERR_CLEAR, ~(u64)0);
+       write_csr(dd, SEND_EGRESS_ERR_CLEAR, ~(u64)0);
+       for (i = 0; i < dd->chip_send_contexts; i++)
+               write_kctxt_csr(dd, i, SEND_CTXT_ERR_CLEAR, ~(u64)0);
+       for (i = 0; i < dd->chip_sdma_engines; i++)
+               write_kctxt_csr(dd, i, SEND_DMA_ENG_ERR_CLEAR, ~(u64)0);
+
+       write_csr(dd, DCC_ERR_FLG_CLR, ~(u64)0);
+       write_csr(dd, DC_LCB_ERR_CLR, ~(u64)0);
+       write_csr(dd, DC_DC8051_ERR_CLR, ~(u64)0);
+}
+
+/* Move to pcie.c? */
+static void disable_intx(struct pci_dev *pdev)
+{
+       pci_intx(pdev, 0);
+}
+
+static void clean_up_interrupts(struct hfi1_devdata *dd)
+{
+       int i;
+
+       /* remove irqs - must happen before disabling/turning off */
+       if (dd->num_msix_entries) {
+               /* MSI-X */
+               struct hfi1_msix_entry *me = dd->msix_entries;
+
+               for (i = 0; i < dd->num_msix_entries; i++, me++) {
+                       if (me->arg == NULL) /* => no irq, no affinity */
+                               break;
+                       irq_set_affinity_hint(dd->msix_entries[i].msix.vector,
+                                       NULL);
+                       free_irq(me->msix.vector, me->arg);
+               }
+       } else {
+               /* INTx */
+               if (dd->requested_intx_irq) {
+                       free_irq(dd->pcidev->irq, dd);
+                       dd->requested_intx_irq = 0;
+               }
+       }
+
+       /* turn off interrupts */
+       if (dd->num_msix_entries) {
+               /* MSI-X */
+               hfi1_nomsix(dd);
+       } else {
+               /* INTx */
+               disable_intx(dd->pcidev);
+       }
+
+       /* clean structures */
+       for (i = 0; i < dd->num_msix_entries; i++)
+               free_cpumask_var(dd->msix_entries[i].mask);
+       kfree(dd->msix_entries);
+       dd->msix_entries = NULL;
+       dd->num_msix_entries = 0;
+}
+
+/*
+ * Remap the interrupt source from the general handler to the given MSI-X
+ * interrupt.
+ */
+static void remap_intr(struct hfi1_devdata *dd, int isrc, int msix_intr)
+{
+       u64 reg;
+       int m, n;
+
+       /* clear from the handled mask of the general interrupt */
+       m = isrc / 64;
+       n = isrc % 64;
+       dd->gi_mask[m] &= ~((u64)1 << n);
+
+       /* direct the chip source to the given MSI-X interrupt */
+       m = isrc / 8;
+       n = isrc % 8;
+       reg = read_csr(dd, CCE_INT_MAP + (8*m));
+       reg &= ~((u64)0xff << (8*n));
+       reg |= ((u64)msix_intr & 0xff) << (8*n);
+       write_csr(dd, CCE_INT_MAP + (8*m), reg);
+}
+
+static void remap_sdma_interrupts(struct hfi1_devdata *dd,
+                                 int engine, int msix_intr)
+{
+       /*
+        * SDMA engine interrupt sources grouped by type, rather than
+        * engine.  Per-engine interrupts are as follows:
+        *      SDMA
+        *      SDMAProgress
+        *      SDMAIdle
+        */
+       remap_intr(dd, IS_SDMA_START + 0*TXE_NUM_SDMA_ENGINES + engine,
+               msix_intr);
+       remap_intr(dd, IS_SDMA_START + 1*TXE_NUM_SDMA_ENGINES + engine,
+               msix_intr);
+       remap_intr(dd, IS_SDMA_START + 2*TXE_NUM_SDMA_ENGINES + engine,
+               msix_intr);
+}
+
+static void remap_receive_available_interrupt(struct hfi1_devdata *dd,
+                                             int rx, int msix_intr)
+{
+       remap_intr(dd, IS_RCVAVAIL_START + rx, msix_intr);
+}
+
+static int request_intx_irq(struct hfi1_devdata *dd)
+{
+       int ret;
+
+       snprintf(dd->intx_name, sizeof(dd->intx_name), DRIVER_NAME"_%d",
+               dd->unit);
+       ret = request_irq(dd->pcidev->irq, general_interrupt,
+                                 IRQF_SHARED, dd->intx_name, dd);
+       if (ret)
+               dd_dev_err(dd, "unable to request INTx interrupt, err %d\n",
+                               ret);
+       else
+               dd->requested_intx_irq = 1;
+       return ret;
+}
+
+static int request_msix_irqs(struct hfi1_devdata *dd)
+{
+       const struct cpumask *local_mask;
+       cpumask_var_t def, rcv;
+       bool def_ret, rcv_ret;
+       int first_general, last_general;
+       int first_sdma, last_sdma;
+       int first_rx, last_rx;
+       int first_cpu, restart_cpu, curr_cpu;
+       int rcv_cpu, sdma_cpu;
+       int i, ret = 0, possible;
+       int ht;
+
+       /* calculate the ranges we are going to use */
+       first_general = 0;
+       first_sdma = last_general = first_general + 1;
+       first_rx = last_sdma = first_sdma + dd->num_sdma;
+       last_rx = first_rx + dd->n_krcv_queues;
+
+       /*
+        * Interrupt affinity.
+        *
+        * non-rcv avail gets a default mask that
+        * starts as possible cpus with threads reset
+        * and each rcv avail reset.
+        *
+        * rcv avail gets node relative 1 wrapping back
+        * to the node relative 1 as necessary.
+        *
+        */
+       local_mask = cpumask_of_pcibus(dd->pcidev->bus);
+       /* if first cpu is invalid, use NUMA 0 */
+       if (cpumask_first(local_mask) >= nr_cpu_ids)
+               local_mask = topology_core_cpumask(0);
+
+       def_ret = zalloc_cpumask_var(&def, GFP_KERNEL);
+       rcv_ret = zalloc_cpumask_var(&rcv, GFP_KERNEL);
+       if (!def_ret || !rcv_ret)
+               goto bail;
+       /* use local mask as default */
+       cpumask_copy(def, local_mask);
+       possible = cpumask_weight(def);
+       /* disarm threads from default */
+       ht = cpumask_weight(
+                       topology_sibling_cpumask(cpumask_first(local_mask)));
+       for (i = possible/ht; i < possible; i++)
+               cpumask_clear_cpu(i, def);
+       /* reset possible */
+       possible = cpumask_weight(def);
+       /* def now has full cores on chosen node*/
+       first_cpu = cpumask_first(def);
+       if (nr_cpu_ids >= first_cpu)
+               first_cpu++;
+       restart_cpu = first_cpu;
+       curr_cpu = restart_cpu;
+
+       for (i = first_cpu; i < dd->n_krcv_queues + first_cpu; i++) {
+               cpumask_clear_cpu(curr_cpu, def);
+               cpumask_set_cpu(curr_cpu, rcv);
+               if (curr_cpu >= possible)
+                       curr_cpu = restart_cpu;
+               else
+                       curr_cpu++;
+       }
+       /* def mask has non-rcv, rcv has recv mask */
+       rcv_cpu = cpumask_first(rcv);
+       sdma_cpu = cpumask_first(def);
+
+       /*
+        * Sanity check - the code expects all SDMA chip source
+        * interrupts to be in the same CSR, starting at bit 0.  Verify
+        * that this is true by checking the bit location of the start.
+        */
+       BUILD_BUG_ON(IS_SDMA_START % 64);
+
+       for (i = 0; i < dd->num_msix_entries; i++) {
+               struct hfi1_msix_entry *me = &dd->msix_entries[i];
+               const char *err_info;
+               irq_handler_t handler;
+               irq_handler_t thread = NULL;
+               void *arg;
+               int idx;
+               struct hfi1_ctxtdata *rcd = NULL;
+               struct sdma_engine *sde = NULL;
+
+               /* obtain the arguments to request_irq */
+               if (first_general <= i && i < last_general) {
+                       idx = i - first_general;
+                       handler = general_interrupt;
+                       arg = dd;
+                       snprintf(me->name, sizeof(me->name),
+                               DRIVER_NAME"_%d", dd->unit);
+                       err_info = "general";
+               } else if (first_sdma <= i && i < last_sdma) {
+                       idx = i - first_sdma;
+                       sde = &dd->per_sdma[idx];
+                       handler = sdma_interrupt;
+                       arg = sde;
+                       snprintf(me->name, sizeof(me->name),
+                               DRIVER_NAME"_%d sdma%d", dd->unit, idx);
+                       err_info = "sdma";
+                       remap_sdma_interrupts(dd, idx, i);
+               } else if (first_rx <= i && i < last_rx) {
+                       idx = i - first_rx;
+                       rcd = dd->rcd[idx];
+                       /* no interrupt if no rcd */
+                       if (!rcd)
+                               continue;
+                       /*
+                        * Set the interrupt register and mask for this
+                        * context's interrupt.
+                        */
+                       rcd->ireg = (IS_RCVAVAIL_START+idx) / 64;
+                       rcd->imask = ((u64)1) <<
+                                       ((IS_RCVAVAIL_START+idx) % 64);
+                       handler = receive_context_interrupt;
+                       thread = receive_context_thread;
+                       arg = rcd;
+                       snprintf(me->name, sizeof(me->name),
+                               DRIVER_NAME"_%d kctxt%d", dd->unit, idx);
+                       err_info = "receive context";
+                       remap_receive_available_interrupt(dd, idx, i);
+               } else {
+                       /* not in our expected range - complain, then
+                          ignore it */
+                       dd_dev_err(dd,
+                               "Unexpected extra MSI-X interrupt %d\n", i);
+                       continue;
+               }
+               /* no argument, no interrupt */
+               if (arg == NULL)
+                       continue;
+               /* make sure the name is terminated */
+               me->name[sizeof(me->name)-1] = 0;
+
+               ret = request_threaded_irq(me->msix.vector, handler, thread, 0,
+                                               me->name, arg);
+               if (ret) {
+                       dd_dev_err(dd,
+                               "unable to allocate %s interrupt, vector %d, index %d, err %d\n",
+                                err_info, me->msix.vector, idx, ret);
+                       return ret;
+               }
+               /*
+                * assign arg after request_irq call, so it will be
+                * cleaned up
+                */
+               me->arg = arg;
+
+               if (!zalloc_cpumask_var(
+                       &dd->msix_entries[i].mask,
+                       GFP_KERNEL))
+                       goto bail;
+               if (handler == sdma_interrupt) {
+                       dd_dev_info(dd, "sdma engine %d cpu %d\n",
+                               sde->this_idx, sdma_cpu);
+                       cpumask_set_cpu(sdma_cpu, dd->msix_entries[i].mask);
+                       sdma_cpu = cpumask_next(sdma_cpu, def);
+                       if (sdma_cpu >= nr_cpu_ids)
+                               sdma_cpu = cpumask_first(def);
+               } else if (handler == receive_context_interrupt) {
+                       dd_dev_info(dd, "rcv ctxt %d cpu %d\n",
+                               rcd->ctxt, rcv_cpu);
+                       cpumask_set_cpu(rcv_cpu, dd->msix_entries[i].mask);
+                       rcv_cpu = cpumask_next(rcv_cpu, rcv);
+                       if (rcv_cpu >= nr_cpu_ids)
+                               rcv_cpu = cpumask_first(rcv);
+               } else {
+                       /* otherwise first def */
+                       dd_dev_info(dd, "%s cpu %d\n",
+                               err_info, cpumask_first(def));
+                       cpumask_set_cpu(
+                               cpumask_first(def), dd->msix_entries[i].mask);
+               }
+               irq_set_affinity_hint(
+                       dd->msix_entries[i].msix.vector,
+                       dd->msix_entries[i].mask);
+       }
+
+out:
+       free_cpumask_var(def);
+       free_cpumask_var(rcv);
+       return ret;
+bail:
+       ret = -ENOMEM;
+       goto  out;
+}
+
+/*
+ * Set the general handler to accept all interrupts, remap all
+ * chip interrupts back to MSI-X 0.
+ */
+static void reset_interrupts(struct hfi1_devdata *dd)
+{
+       int i;
+
+       /* all interrupts handled by the general handler */
+       for (i = 0; i < CCE_NUM_INT_CSRS; i++)
+               dd->gi_mask[i] = ~(u64)0;
+
+       /* all chip interrupts map to MSI-X 0 */
+       for (i = 0; i < CCE_NUM_INT_MAP_CSRS; i++)
+               write_csr(dd, CCE_INT_MAP + (8*i), 0);
+}
+
+static int set_up_interrupts(struct hfi1_devdata *dd)
+{
+       struct hfi1_msix_entry *entries;
+       u32 total, request;
+       int i, ret;
+       int single_interrupt = 0; /* we expect to have all the interrupts */
+
+       /*
+        * Interrupt count:
+        *      1 general, "slow path" interrupt (includes the SDMA engines
+        *              slow source, SDMACleanupDone)
+        *      N interrupts - one per used SDMA engine
+        *      M interrupt - one per kernel receive context
+        */
+       total = 1 + dd->num_sdma + dd->n_krcv_queues;
+
+       entries = kcalloc(total, sizeof(*entries), GFP_KERNEL);
+       if (!entries) {
+               ret = -ENOMEM;
+               goto fail;
+       }
+       /* 1-1 MSI-X entry assignment */
+       for (i = 0; i < total; i++)
+               entries[i].msix.entry = i;
+
+       /* ask for MSI-X interrupts */
+       request = total;
+       request_msix(dd, &request, entries);
+
+       if (request == 0) {
+               /* using INTx */
+               /* dd->num_msix_entries already zero */
+               kfree(entries);
+               single_interrupt = 1;
+               dd_dev_err(dd, "MSI-X failed, using INTx interrupts\n");
+       } else {
+               /* using MSI-X */
+               dd->num_msix_entries = request;
+               dd->msix_entries = entries;
+
+               if (request != total) {
+                       /* using MSI-X, with reduced interrupts */
+                       dd_dev_err(
+                               dd,
+                               "cannot handle reduced interrupt case, want %u, got %u\n",
+                               total, request);
+                       ret = -EINVAL;
+                       goto fail;
+               }
+               dd_dev_info(dd, "%u MSI-X interrupts allocated\n", total);
+       }
+
+       /* mask all interrupts */
+       set_intr_state(dd, 0);
+       /* clear all pending interrupts */
+       clear_all_interrupts(dd);
+
+       /* reset general handler mask, chip MSI-X mappings */
+       reset_interrupts(dd);
+
+       if (single_interrupt)
+               ret = request_intx_irq(dd);
+       else
+               ret = request_msix_irqs(dd);
+       if (ret)
+               goto fail;
+
+       return 0;
+
+fail:
+       clean_up_interrupts(dd);
+       return ret;
+}
+
+/*
+ * Set up context values in dd.  Sets:
+ *
+ *     num_rcv_contexts - number of contexts being used
+ *     n_krcv_queues - number of kernel contexts
+ *     first_user_ctxt - first non-kernel context in array of contexts
+ *     freectxts  - number of free user contexts
+ *     num_send_contexts - number of PIO send contexts being used
+ */
+static int set_up_context_variables(struct hfi1_devdata *dd)
+{
+       int num_kernel_contexts;
+       int num_user_contexts;
+       int total_contexts;
+       int ret;
+       unsigned ngroups;
+
+       /*
+        * Kernel contexts: (to be fixed later):
+        * - min or 2 or 1 context/numa
+        * - Context 0 - default/errors
+        * - Context 1 - VL15
+        */
+       if (n_krcvqs)
+               num_kernel_contexts = n_krcvqs + MIN_KERNEL_KCTXTS;
+       else
+               num_kernel_contexts = num_online_nodes();
+       num_kernel_contexts =
+               max_t(int, MIN_KERNEL_KCTXTS, num_kernel_contexts);
+       /*
+        * Every kernel receive context needs an ACK send context.
+        * one send context is allocated for each VL{0-7} and VL15
+        */
+       if (num_kernel_contexts > (dd->chip_send_contexts - num_vls - 1)) {
+               dd_dev_err(dd,
+                          "Reducing # kernel rcv contexts to: %d, from %d\n",
+                          (int)(dd->chip_send_contexts - num_vls - 1),
+                          (int)num_kernel_contexts);
+               num_kernel_contexts = dd->chip_send_contexts - num_vls - 1;
+       }
+       /*
+        * User contexts: (to be fixed later)
+        *      - set to num_rcv_contexts if non-zero
+        *      - default to 1 user context per CPU
+        */
+       if (num_rcv_contexts)
+               num_user_contexts = num_rcv_contexts;
+       else
+               num_user_contexts = num_online_cpus();
+
+       total_contexts = num_kernel_contexts + num_user_contexts;
+
+       /*
+        * Adjust the counts given a global max.
+        */
+       if (total_contexts > dd->chip_rcv_contexts) {
+               dd_dev_err(dd,
+                          "Reducing # user receive contexts to: %d, from %d\n",
+                          (int)(dd->chip_rcv_contexts - num_kernel_contexts),
+                          (int)num_user_contexts);
+               num_user_contexts = dd->chip_rcv_contexts - num_kernel_contexts;
+               /* recalculate */
+               total_contexts = num_kernel_contexts + num_user_contexts;
+       }
+
+       /* the first N are kernel contexts, the rest are user contexts */
+       dd->num_rcv_contexts = total_contexts;
+       dd->n_krcv_queues = num_kernel_contexts;
+       dd->first_user_ctxt = num_kernel_contexts;
+       dd->freectxts = num_user_contexts;
+       dd_dev_info(dd,
+               "rcv contexts: chip %d, used %d (kernel %d, user %d)\n",
+               (int)dd->chip_rcv_contexts,
+               (int)dd->num_rcv_contexts,
+               (int)dd->n_krcv_queues,
+               (int)dd->num_rcv_contexts - dd->n_krcv_queues);
+
+       /*
+        * Receive array allocation:
+        *   All RcvArray entries are divided into groups of 8. This
+        *   is required by the hardware and will speed up writes to
+        *   consecutive entries by using write-combining of the entire
+        *   cacheline.
+        *
+        *   The number of groups are evenly divided among all contexts.
+        *   any left over groups will be given to the first N user
+        *   contexts.
+        */
+       dd->rcv_entries.group_size = RCV_INCREMENT;
+       ngroups = dd->chip_rcv_array_count / dd->rcv_entries.group_size;
+       dd->rcv_entries.ngroups = ngroups / dd->num_rcv_contexts;
+       dd->rcv_entries.nctxt_extra = ngroups -
+               (dd->num_rcv_contexts * dd->rcv_entries.ngroups);
+       dd_dev_info(dd, "RcvArray groups %u, ctxts extra %u\n",
+                   dd->rcv_entries.ngroups,
+                   dd->rcv_entries.nctxt_extra);
+       if (dd->rcv_entries.ngroups * dd->rcv_entries.group_size >
+           MAX_EAGER_ENTRIES * 2) {
+               dd->rcv_entries.ngroups = (MAX_EAGER_ENTRIES * 2) /
+                       dd->rcv_entries.group_size;
+               dd_dev_info(dd,
+                  "RcvArray group count too high, change to %u\n",
+                  dd->rcv_entries.ngroups);
+               dd->rcv_entries.nctxt_extra = 0;
+       }
+       /*
+        * PIO send contexts
+        */
+       ret = init_sc_pools_and_sizes(dd);
+       if (ret >= 0) { /* success */
+               dd->num_send_contexts = ret;
+               dd_dev_info(
+                       dd,
+                       "send contexts: chip %d, used %d (kernel %d, ack %d, user %d)\n",
+                       dd->chip_send_contexts,
+                       dd->num_send_contexts,
+                       dd->sc_sizes[SC_KERNEL].count,
+                       dd->sc_sizes[SC_ACK].count,
+                       dd->sc_sizes[SC_USER].count);
+               ret = 0;        /* success */
+       }
+
+       return ret;
+}
+
+/*
+ * Set the device/port partition key table. The MAD code
+ * will ensure that, at least, the partial management
+ * partition key is present in the table.
+ */
+static void set_partition_keys(struct hfi1_pportdata *ppd)
+{
+       struct hfi1_devdata *dd = ppd->dd;
+       u64 reg = 0;
+       int i;
+
+       dd_dev_info(dd, "Setting partition keys\n");
+       for (i = 0; i < hfi1_get_npkeys(dd); i++) {
+               reg |= (ppd->pkeys[i] &
+                       RCV_PARTITION_KEY_PARTITION_KEY_A_MASK) <<
+                       ((i % 4) *
+                        RCV_PARTITION_KEY_PARTITION_KEY_B_SHIFT);
+               /* Each register holds 4 PKey values. */
+               if ((i % 4) == 3) {
+                       write_csr(dd, RCV_PARTITION_KEY +
+                                 ((i - 3) * 2), reg);
+                       reg = 0;
+               }
+       }
+
+       /* Always enable HW pkeys check when pkeys table is set */
+       add_rcvctrl(dd, RCV_CTRL_RCV_PARTITION_KEY_ENABLE_SMASK);
+}
+
+/*
+ * These CSRs and memories are uninitialized on reset and must be
+ * written before reading to set the ECC/parity bits.
+ *
+ * NOTE: All user context CSRs that are not mmaped write-only
+ * (e.g. the TID flows) must be initialized even if the driver never
+ * reads them.
+ */
+static void write_uninitialized_csrs_and_memories(struct hfi1_devdata *dd)
+{
+       int i, j;
+
+       /* CceIntMap */
+       for (i = 0; i < CCE_NUM_INT_MAP_CSRS; i++)
+               write_csr(dd, CCE_INT_MAP+(8*i), 0);
+
+       /* SendCtxtCreditReturnAddr */
+       for (i = 0; i < dd->chip_send_contexts; i++)
+               write_kctxt_csr(dd, i, SEND_CTXT_CREDIT_RETURN_ADDR, 0);
+
+       /* PIO Send buffers */
+       /* SDMA Send buffers */
+       /* These are not normally read, and (presently) have no method
+          to be read, so are not pre-initialized */
+
+       /* RcvHdrAddr */
+       /* RcvHdrTailAddr */
+       /* RcvTidFlowTable */
+       for (i = 0; i < dd->chip_rcv_contexts; i++) {
+               write_kctxt_csr(dd, i, RCV_HDR_ADDR, 0);
+               write_kctxt_csr(dd, i, RCV_HDR_TAIL_ADDR, 0);
+               for (j = 0; j < RXE_NUM_TID_FLOWS; j++)
+                       write_uctxt_csr(dd, i, RCV_TID_FLOW_TABLE+(8*j), 0);
+       }
+
+       /* RcvArray */
+       for (i = 0; i < dd->chip_rcv_array_count; i++)
+               write_csr(dd, RCV_ARRAY + (8*i),
+                                       RCV_ARRAY_RT_WRITE_ENABLE_SMASK);
+
+       /* RcvQPMapTable */
+       for (i = 0; i < 32; i++)
+               write_csr(dd, RCV_QP_MAP_TABLE + (8 * i), 0);
+}
+
+/*
+ * Use the ctrl_bits in CceCtrl to clear the status_bits in CceStatus.
+ */
+static void clear_cce_status(struct hfi1_devdata *dd, u64 status_bits,
+                            u64 ctrl_bits)
+{
+       unsigned long timeout;
+       u64 reg;
+
+       /* is the condition present? */
+       reg = read_csr(dd, CCE_STATUS);
+       if ((reg & status_bits) == 0)
+               return;
+
+       /* clear the condition */
+       write_csr(dd, CCE_CTRL, ctrl_bits);
+
+       /* wait for the condition to clear */
+       timeout = jiffies + msecs_to_jiffies(CCE_STATUS_TIMEOUT);
+       while (1) {
+               reg = read_csr(dd, CCE_STATUS);
+               if ((reg & status_bits) == 0)
+                       return;
+               if (time_after(jiffies, timeout)) {
+                       dd_dev_err(dd,
+                               "Timeout waiting for CceStatus to clear bits 0x%llx, remaining 0x%llx\n",
+                               status_bits, reg & status_bits);
+                       return;
+               }
+               udelay(1);
+       }
+}
+
+/* set CCE CSRs to chip reset defaults */
+static void reset_cce_csrs(struct hfi1_devdata *dd)
+{
+       int i;
+
+       /* CCE_REVISION read-only */
+       /* CCE_REVISION2 read-only */
+       /* CCE_CTRL - bits clear automatically */
+       /* CCE_STATUS read-only, use CceCtrl to clear */
+       clear_cce_status(dd, ALL_FROZE, CCE_CTRL_SPC_UNFREEZE_SMASK);
+       clear_cce_status(dd, ALL_TXE_PAUSE, CCE_CTRL_TXE_RESUME_SMASK);
+       clear_cce_status(dd, ALL_RXE_PAUSE, CCE_CTRL_RXE_RESUME_SMASK);
+       for (i = 0; i < CCE_NUM_SCRATCH; i++)
+               write_csr(dd, CCE_SCRATCH + (8 * i), 0);
+       /* CCE_ERR_STATUS read-only */
+       write_csr(dd, CCE_ERR_MASK, 0);
+       write_csr(dd, CCE_ERR_CLEAR, ~0ull);
+       /* CCE_ERR_FORCE leave alone */
+       for (i = 0; i < CCE_NUM_32_BIT_COUNTERS; i++)
+               write_csr(dd, CCE_COUNTER_ARRAY32 + (8 * i), 0);
+       write_csr(dd, CCE_DC_CTRL, CCE_DC_CTRL_RESETCSR);
+       /* CCE_PCIE_CTRL leave alone */
+       for (i = 0; i < CCE_NUM_MSIX_VECTORS; i++) {
+               write_csr(dd, CCE_MSIX_TABLE_LOWER + (8 * i), 0);
+               write_csr(dd, CCE_MSIX_TABLE_UPPER + (8 * i),
+                                       CCE_MSIX_TABLE_UPPER_RESETCSR);
+       }
+       for (i = 0; i < CCE_NUM_MSIX_PBAS; i++) {
+               /* CCE_MSIX_PBA read-only */
+               write_csr(dd, CCE_MSIX_INT_GRANTED, ~0ull);
+               write_csr(dd, CCE_MSIX_VEC_CLR_WITHOUT_INT, ~0ull);
+       }
+       for (i = 0; i < CCE_NUM_INT_MAP_CSRS; i++)
+               write_csr(dd, CCE_INT_MAP, 0);
+       for (i = 0; i < CCE_NUM_INT_CSRS; i++) {
+               /* CCE_INT_STATUS read-only */
+               write_csr(dd, CCE_INT_MASK + (8 * i), 0);
+               write_csr(dd, CCE_INT_CLEAR + (8 * i), ~0ull);
+               /* CCE_INT_FORCE leave alone */
+               /* CCE_INT_BLOCKED read-only */
+       }
+       for (i = 0; i < CCE_NUM_32_BIT_INT_COUNTERS; i++)
+               write_csr(dd, CCE_INT_COUNTER_ARRAY32 + (8 * i), 0);
+}
+
+/* set ASIC CSRs to chip reset defaults */
+static void reset_asic_csrs(struct hfi1_devdata *dd)
+{
+       int i;
+
+       /*
+        * If the HFIs are shared between separate nodes or VMs,
+        * then more will need to be done here.  One idea is a module
+        * parameter that returns early, letting the first power-on or
+        * a known first load do the reset and blocking all others.
+        */
+
+       if (!(dd->flags & HFI1_DO_INIT_ASIC))
+               return;
+
+       if (dd->icode != ICODE_FPGA_EMULATION) {
+               /* emulation does not have an SBus - leave these alone */
+               /*
+                * All writes to ASIC_CFG_SBUS_REQUEST do something.
+                * Notes:
+                * o The reset is not zero if aimed at the core.  See the
+                *   SBus documentation for details.
+                * o If the SBus firmware has been updated (e.g. by the BIOS),
+                *   will the reset revert that?
+                */
+               /* ASIC_CFG_SBUS_REQUEST leave alone */
+               write_csr(dd, ASIC_CFG_SBUS_EXECUTE, 0);
+       }
+       /* ASIC_SBUS_RESULT read-only */
+       write_csr(dd, ASIC_STS_SBUS_COUNTERS, 0);
+       for (i = 0; i < ASIC_NUM_SCRATCH; i++)
+               write_csr(dd, ASIC_CFG_SCRATCH + (8 * i), 0);
+       write_csr(dd, ASIC_CFG_MUTEX, 0);       /* this will clear it */
+
+       /* We might want to retain this state across FLR if we ever use it */
+       write_csr(dd, ASIC_CFG_DRV_STR, 0);
+
+       write_csr(dd, ASIC_CFG_THERM_POLL_EN, 0);
+       /* ASIC_STS_THERM read-only */
+       /* ASIC_CFG_RESET leave alone */
+
+       write_csr(dd, ASIC_PCIE_SD_HOST_CMD, 0);
+       /* ASIC_PCIE_SD_HOST_STATUS read-only */
+       write_csr(dd, ASIC_PCIE_SD_INTRPT_DATA_CODE, 0);
+       write_csr(dd, ASIC_PCIE_SD_INTRPT_ENABLE, 0);
+       /* ASIC_PCIE_SD_INTRPT_PROGRESS read-only */
+       write_csr(dd, ASIC_PCIE_SD_INTRPT_STATUS, ~0ull); /* clear */
+       /* ASIC_HFI0_PCIE_SD_INTRPT_RSPD_DATA read-only */
+       /* ASIC_HFI1_PCIE_SD_INTRPT_RSPD_DATA read-only */
+       for (i = 0; i < 16; i++)
+               write_csr(dd, ASIC_PCIE_SD_INTRPT_LIST + (8 * i), 0);
+
+       /* ASIC_GPIO_IN read-only */
+       write_csr(dd, ASIC_GPIO_OE, 0);
+       write_csr(dd, ASIC_GPIO_INVERT, 0);
+       write_csr(dd, ASIC_GPIO_OUT, 0);
+       write_csr(dd, ASIC_GPIO_MASK, 0);
+       /* ASIC_GPIO_STATUS read-only */
+       write_csr(dd, ASIC_GPIO_CLEAR, ~0ull);
+       /* ASIC_GPIO_FORCE leave alone */
+
+       /* ASIC_QSFP1_IN read-only */
+       write_csr(dd, ASIC_QSFP1_OE, 0);
+       write_csr(dd, ASIC_QSFP1_INVERT, 0);
+       write_csr(dd, ASIC_QSFP1_OUT, 0);
+       write_csr(dd, ASIC_QSFP1_MASK, 0);
+       /* ASIC_QSFP1_STATUS read-only */
+       write_csr(dd, ASIC_QSFP1_CLEAR, ~0ull);
+       /* ASIC_QSFP1_FORCE leave alone */
+
+       /* ASIC_QSFP2_IN read-only */
+       write_csr(dd, ASIC_QSFP2_OE, 0);
+       write_csr(dd, ASIC_QSFP2_INVERT, 0);
+       write_csr(dd, ASIC_QSFP2_OUT, 0);
+       write_csr(dd, ASIC_QSFP2_MASK, 0);
+       /* ASIC_QSFP2_STATUS read-only */
+       write_csr(dd, ASIC_QSFP2_CLEAR, ~0ull);
+       /* ASIC_QSFP2_FORCE leave alone */
+
+       write_csr(dd, ASIC_EEP_CTL_STAT, ASIC_EEP_CTL_STAT_RESETCSR);
+       /* this also writes a NOP command, clearing paging mode */
+       write_csr(dd, ASIC_EEP_ADDR_CMD, 0);
+       write_csr(dd, ASIC_EEP_DATA, 0);
+}
+
+/* set MISC CSRs to chip reset defaults */
+static void reset_misc_csrs(struct hfi1_devdata *dd)
+{
+       int i;
+
+       for (i = 0; i < 32; i++) {
+               write_csr(dd, MISC_CFG_RSA_R2 + (8 * i), 0);
+               write_csr(dd, MISC_CFG_RSA_SIGNATURE + (8 * i), 0);
+               write_csr(dd, MISC_CFG_RSA_MODULUS + (8 * i), 0);
+       }
+       /* MISC_CFG_SHA_PRELOAD leave alone - always reads 0 and can
+          only be written 128-byte chunks */
+       /* init RSA engine to clear lingering errors */
+       write_csr(dd, MISC_CFG_RSA_CMD, 1);
+       write_csr(dd, MISC_CFG_RSA_MU, 0);
+       write_csr(dd, MISC_CFG_FW_CTRL, 0);
+       /* MISC_STS_8051_DIGEST read-only */
+       /* MISC_STS_SBM_DIGEST read-only */
+       /* MISC_STS_PCIE_DIGEST read-only */
+       /* MISC_STS_FAB_DIGEST read-only */
+       /* MISC_ERR_STATUS read-only */
+       write_csr(dd, MISC_ERR_MASK, 0);
+       write_csr(dd, MISC_ERR_CLEAR, ~0ull);
+       /* MISC_ERR_FORCE leave alone */
+}
+
+/* set TXE CSRs to chip reset defaults */
+static void reset_txe_csrs(struct hfi1_devdata *dd)
+{
+       int i;
+
+       /*
+        * TXE Kernel CSRs
+        */
+       write_csr(dd, SEND_CTRL, 0);
+       __cm_reset(dd, 0);      /* reset CM internal state */
+       /* SEND_CONTEXTS read-only */
+       /* SEND_DMA_ENGINES read-only */
+       /* SEND_PIO_MEM_SIZE read-only */
+       /* SEND_DMA_MEM_SIZE read-only */
+       write_csr(dd, SEND_HIGH_PRIORITY_LIMIT, 0);
+       pio_reset_all(dd);      /* SEND_PIO_INIT_CTXT */
+       /* SEND_PIO_ERR_STATUS read-only */
+       write_csr(dd, SEND_PIO_ERR_MASK, 0);
+       write_csr(dd, SEND_PIO_ERR_CLEAR, ~0ull);
+       /* SEND_PIO_ERR_FORCE leave alone */
+       /* SEND_DMA_ERR_STATUS read-only */
+       write_csr(dd, SEND_DMA_ERR_MASK, 0);
+       write_csr(dd, SEND_DMA_ERR_CLEAR, ~0ull);
+       /* SEND_DMA_ERR_FORCE leave alone */
+       /* SEND_EGRESS_ERR_STATUS read-only */
+       write_csr(dd, SEND_EGRESS_ERR_MASK, 0);
+       write_csr(dd, SEND_EGRESS_ERR_CLEAR, ~0ull);
+       /* SEND_EGRESS_ERR_FORCE leave alone */
+       write_csr(dd, SEND_BTH_QP, 0);
+       write_csr(dd, SEND_STATIC_RATE_CONTROL, 0);
+       write_csr(dd, SEND_SC2VLT0, 0);
+       write_csr(dd, SEND_SC2VLT1, 0);
+       write_csr(dd, SEND_SC2VLT2, 0);
+       write_csr(dd, SEND_SC2VLT3, 0);
+       write_csr(dd, SEND_LEN_CHECK0, 0);
+       write_csr(dd, SEND_LEN_CHECK1, 0);
+       /* SEND_ERR_STATUS read-only */
+       write_csr(dd, SEND_ERR_MASK, 0);
+       write_csr(dd, SEND_ERR_CLEAR, ~0ull);
+       /* SEND_ERR_FORCE read-only */
+       for (i = 0; i < VL_ARB_LOW_PRIO_TABLE_SIZE; i++)
+               write_csr(dd, SEND_LOW_PRIORITY_LIST + (8*i), 0);
+       for (i = 0; i < VL_ARB_HIGH_PRIO_TABLE_SIZE; i++)
+               write_csr(dd, SEND_HIGH_PRIORITY_LIST + (8*i), 0);
+       for (i = 0; i < dd->chip_send_contexts/NUM_CONTEXTS_PER_SET; i++)
+               write_csr(dd, SEND_CONTEXT_SET_CTRL + (8*i), 0);
+       for (i = 0; i < TXE_NUM_32_BIT_COUNTER; i++)
+               write_csr(dd, SEND_COUNTER_ARRAY32 + (8*i), 0);
+       for (i = 0; i < TXE_NUM_64_BIT_COUNTER; i++)
+               write_csr(dd, SEND_COUNTER_ARRAY64 + (8*i), 0);
+       write_csr(dd, SEND_CM_CTRL, SEND_CM_CTRL_RESETCSR);
+       write_csr(dd, SEND_CM_GLOBAL_CREDIT,
+                                       SEND_CM_GLOBAL_CREDIT_RESETCSR);
+       /* SEND_CM_CREDIT_USED_STATUS read-only */
+       write_csr(dd, SEND_CM_TIMER_CTRL, 0);
+       write_csr(dd, SEND_CM_LOCAL_AU_TABLE0_TO3, 0);
+       write_csr(dd, SEND_CM_LOCAL_AU_TABLE4_TO7, 0);
+       write_csr(dd, SEND_CM_REMOTE_AU_TABLE0_TO3, 0);
+       write_csr(dd, SEND_CM_REMOTE_AU_TABLE4_TO7, 0);
+       for (i = 0; i < TXE_NUM_DATA_VL; i++)
+               write_csr(dd, SEND_CM_CREDIT_VL + (8*i), 0);
+       write_csr(dd, SEND_CM_CREDIT_VL15, 0);
+       /* SEND_CM_CREDIT_USED_VL read-only */
+       /* SEND_CM_CREDIT_USED_VL15 read-only */
+       /* SEND_EGRESS_CTXT_STATUS read-only */
+       /* SEND_EGRESS_SEND_DMA_STATUS read-only */
+       write_csr(dd, SEND_EGRESS_ERR_INFO, ~0ull);
+       /* SEND_EGRESS_ERR_INFO read-only */
+       /* SEND_EGRESS_ERR_SOURCE read-only */
+
+       /*
+        * TXE Per-Context CSRs
+        */
+       for (i = 0; i < dd->chip_send_contexts; i++) {
+               write_kctxt_csr(dd, i, SEND_CTXT_CTRL, 0);
+               write_kctxt_csr(dd, i, SEND_CTXT_CREDIT_CTRL, 0);
+               write_kctxt_csr(dd, i, SEND_CTXT_CREDIT_RETURN_ADDR, 0);
+               write_kctxt_csr(dd, i, SEND_CTXT_CREDIT_FORCE, 0);
+               write_kctxt_csr(dd, i, SEND_CTXT_ERR_MASK, 0);
+               write_kctxt_csr(dd, i, SEND_CTXT_ERR_CLEAR, ~0ull);
+               write_kctxt_csr(dd, i, SEND_CTXT_CHECK_ENABLE, 0);
+               write_kctxt_csr(dd, i, SEND_CTXT_CHECK_VL, 0);
+               write_kctxt_csr(dd, i, SEND_CTXT_CHECK_JOB_KEY, 0);
+               write_kctxt_csr(dd, i, SEND_CTXT_CHECK_PARTITION_KEY, 0);
+               write_kctxt_csr(dd, i, SEND_CTXT_CHECK_SLID, 0);
+               write_kctxt_csr(dd, i, SEND_CTXT_CHECK_OPCODE, 0);
+       }
+
+       /*
+        * TXE Per-SDMA CSRs
+        */
+       for (i = 0; i < dd->chip_sdma_engines; i++) {
+               write_kctxt_csr(dd, i, SEND_DMA_CTRL, 0);
+               /* SEND_DMA_STATUS read-only */
+               write_kctxt_csr(dd, i, SEND_DMA_BASE_ADDR, 0);
+               write_kctxt_csr(dd, i, SEND_DMA_LEN_GEN, 0);
+               write_kctxt_csr(dd, i, SEND_DMA_TAIL, 0);
+               /* SEND_DMA_HEAD read-only */
+               write_kctxt_csr(dd, i, SEND_DMA_HEAD_ADDR, 0);
+               write_kctxt_csr(dd, i, SEND_DMA_PRIORITY_THLD, 0);
+               /* SEND_DMA_IDLE_CNT read-only */
+               write_kctxt_csr(dd, i, SEND_DMA_RELOAD_CNT, 0);
+               write_kctxt_csr(dd, i, SEND_DMA_DESC_CNT, 0);
+               /* SEND_DMA_DESC_FETCHED_CNT read-only */
+               /* SEND_DMA_ENG_ERR_STATUS read-only */
+               write_kctxt_csr(dd, i, SEND_DMA_ENG_ERR_MASK, 0);
+               write_kctxt_csr(dd, i, SEND_DMA_ENG_ERR_CLEAR, ~0ull);
+               /* SEND_DMA_ENG_ERR_FORCE leave alone */
+               write_kctxt_csr(dd, i, SEND_DMA_CHECK_ENABLE, 0);
+               write_kctxt_csr(dd, i, SEND_DMA_CHECK_VL, 0);
+               write_kctxt_csr(dd, i, SEND_DMA_CHECK_JOB_KEY, 0);
+               write_kctxt_csr(dd, i, SEND_DMA_CHECK_PARTITION_KEY, 0);
+               write_kctxt_csr(dd, i, SEND_DMA_CHECK_SLID, 0);
+               write_kctxt_csr(dd, i, SEND_DMA_CHECK_OPCODE, 0);
+               write_kctxt_csr(dd, i, SEND_DMA_MEMORY, 0);
+       }
+}
+
+/*
+ * Expect on entry:
+ * o Packet ingress is disabled, i.e. RcvCtrl.RcvPortEnable == 0
+ */
+static void init_rbufs(struct hfi1_devdata *dd)
+{
+       u64 reg;
+       int count;
+
+       /*
+        * Wait for DMA to stop: RxRbufPktPending and RxPktInProgress are
+        * clear.
+        */
+       count = 0;
+       while (1) {
+               reg = read_csr(dd, RCV_STATUS);
+               if ((reg & (RCV_STATUS_RX_RBUF_PKT_PENDING_SMASK
+                           | RCV_STATUS_RX_PKT_IN_PROGRESS_SMASK)) == 0)
+                       break;
+               /*
+                * Give up after 1ms - maximum wait time.
+                *
+                * RBuf size is 148KiB.  Slowest possible is PCIe Gen1 x1 at
+                * 250MB/s bandwidth.  Lower rate to 66% for overhead to get:
+                *      148 KB / (66% * 250MB/s) = 920us
+                */
+               if (count++ > 500) {
+                       dd_dev_err(dd,
+                               "%s: in-progress DMA not clearing: RcvStatus 0x%llx, continuing\n",
+                               __func__, reg);
+                       break;
+               }
+               udelay(2); /* do not busy-wait the CSR */
+       }
+
+       /* start the init - expect RcvCtrl to be 0 */
+       write_csr(dd, RCV_CTRL, RCV_CTRL_RX_RBUF_INIT_SMASK);
+
+       /*
+        * Read to force the write of Rcvtrl.RxRbufInit.  There is a brief
+        * period after the write before RcvStatus.RxRbufInitDone is valid.
+        * The delay in the first run through the loop below is sufficient and
+        * required before the first read of RcvStatus.RxRbufInintDone.
+        */
+       read_csr(dd, RCV_CTRL);
+
+       /* wait for the init to finish */
+       count = 0;
+       while (1) {
+               /* delay is required first time through - see above */
+               udelay(2); /* do not busy-wait the CSR */
+               reg = read_csr(dd, RCV_STATUS);
+               if (reg & (RCV_STATUS_RX_RBUF_INIT_DONE_SMASK))
+                       break;
+
+               /* give up after 100us - slowest possible at 33MHz is 73us */
+               if (count++ > 50) {
+                       dd_dev_err(dd,
+                               "%s: RcvStatus.RxRbufInit not set, continuing\n",
+                               __func__);
+                       break;
+               }
+       }
+}
+
+/* set RXE CSRs to chip reset defaults */
+static void reset_rxe_csrs(struct hfi1_devdata *dd)
+{
+       int i, j;
+
+       /*
+        * RXE Kernel CSRs
+        */
+       write_csr(dd, RCV_CTRL, 0);
+       init_rbufs(dd);
+       /* RCV_STATUS read-only */
+       /* RCV_CONTEXTS read-only */
+       /* RCV_ARRAY_CNT read-only */
+       /* RCV_BUF_SIZE read-only */
+       write_csr(dd, RCV_BTH_QP, 0);
+       write_csr(dd, RCV_MULTICAST, 0);
+       write_csr(dd, RCV_BYPASS, 0);
+       write_csr(dd, RCV_VL15, 0);
+       /* this is a clear-down */
+       write_csr(dd, RCV_ERR_INFO,
+                       RCV_ERR_INFO_RCV_EXCESS_BUFFER_OVERRUN_SMASK);
+       /* RCV_ERR_STATUS read-only */
+       write_csr(dd, RCV_ERR_MASK, 0);
+       write_csr(dd, RCV_ERR_CLEAR, ~0ull);
+       /* RCV_ERR_FORCE leave alone */
+       for (i = 0; i < 32; i++)
+               write_csr(dd, RCV_QP_MAP_TABLE + (8 * i), 0);
+       for (i = 0; i < 4; i++)
+               write_csr(dd, RCV_PARTITION_KEY + (8 * i), 0);
+       for (i = 0; i < RXE_NUM_32_BIT_COUNTERS; i++)
+               write_csr(dd, RCV_COUNTER_ARRAY32 + (8 * i), 0);
+       for (i = 0; i < RXE_NUM_64_BIT_COUNTERS; i++)
+               write_csr(dd, RCV_COUNTER_ARRAY64 + (8 * i), 0);
+       for (i = 0; i < RXE_NUM_RSM_INSTANCES; i++) {
+               write_csr(dd, RCV_RSM_CFG + (8 * i), 0);
+               write_csr(dd, RCV_RSM_SELECT + (8 * i), 0);
+               write_csr(dd, RCV_RSM_MATCH + (8 * i), 0);
+       }
+       for (i = 0; i < 32; i++)
+               write_csr(dd, RCV_RSM_MAP_TABLE + (8 * i), 0);
+
+       /*
+        * RXE Kernel and User Per-Context CSRs
+        */
+       for (i = 0; i < dd->chip_rcv_contexts; i++) {
+               /* kernel */
+               write_kctxt_csr(dd, i, RCV_CTXT_CTRL, 0);
+               /* RCV_CTXT_STATUS read-only */
+               write_kctxt_csr(dd, i, RCV_EGR_CTRL, 0);
+               write_kctxt_csr(dd, i, RCV_TID_CTRL, 0);
+               write_kctxt_csr(dd, i, RCV_KEY_CTRL, 0);
+               write_kctxt_csr(dd, i, RCV_HDR_ADDR, 0);
+               write_kctxt_csr(dd, i, RCV_HDR_CNT, 0);
+               write_kctxt_csr(dd, i, RCV_HDR_ENT_SIZE, 0);
+               write_kctxt_csr(dd, i, RCV_HDR_SIZE, 0);
+               write_kctxt_csr(dd, i, RCV_HDR_TAIL_ADDR, 0);
+               write_kctxt_csr(dd, i, RCV_AVAIL_TIME_OUT, 0);
+               write_kctxt_csr(dd, i, RCV_HDR_OVFL_CNT, 0);
+
+               /* user */
+               /* RCV_HDR_TAIL read-only */
+               write_uctxt_csr(dd, i, RCV_HDR_HEAD, 0);
+               /* RCV_EGR_INDEX_TAIL read-only */
+               write_uctxt_csr(dd, i, RCV_EGR_INDEX_HEAD, 0);
+               /* RCV_EGR_OFFSET_TAIL read-only */
+               for (j = 0; j < RXE_NUM_TID_FLOWS; j++) {
+                       write_uctxt_csr(dd, i, RCV_TID_FLOW_TABLE + (8 * j),
+                               0);
+               }
+       }
+}
+
+/*
+ * Set sc2vl tables.
+ *
+ * They power on to zeros, so to avoid send context errors
+ * they need to be set:
+ *
+ * SC 0-7 -> VL 0-7 (respectively)
+ * SC 15  -> VL 15
+ * otherwise
+ *        -> VL 0
+ */
+static void init_sc2vl_tables(struct hfi1_devdata *dd)
+{
+       int i;
+       /* init per architecture spec, constrained by hardware capability */
+
+       /* HFI maps sent packets */
+       write_csr(dd, SEND_SC2VLT0, SC2VL_VAL(
+               0,
+               0, 0, 1, 1,
+               2, 2, 3, 3,
+               4, 4, 5, 5,
+               6, 6, 7, 7));
+       write_csr(dd, SEND_SC2VLT1, SC2VL_VAL(
+               1,
+               8, 0, 9, 0,
+               10, 0, 11, 0,
+               12, 0, 13, 0,
+               14, 0, 15, 15));
+       write_csr(dd, SEND_SC2VLT2, SC2VL_VAL(
+               2,
+               16, 0, 17, 0,
+               18, 0, 19, 0,
+               20, 0, 21, 0,
+               22, 0, 23, 0));
+       write_csr(dd, SEND_SC2VLT3, SC2VL_VAL(
+               3,
+               24, 0, 25, 0,
+               26, 0, 27, 0,
+               28, 0, 29, 0,
+               30, 0, 31, 0));
+
+       /* DC maps received packets */
+       write_csr(dd, DCC_CFG_SC_VL_TABLE_15_0, DC_SC_VL_VAL(
+               15_0,
+               0, 0, 1, 1,  2, 2,  3, 3,  4, 4,  5, 5,  6, 6,  7,  7,
+               8, 0, 9, 0, 10, 0, 11, 0, 12, 0, 13, 0, 14, 0, 15, 15));
+       write_csr(dd, DCC_CFG_SC_VL_TABLE_31_16, DC_SC_VL_VAL(
+               31_16,
+               16, 0, 17, 0, 18, 0, 19, 0, 20, 0, 21, 0, 22, 0, 23, 0,
+               24, 0, 25, 0, 26, 0, 27, 0, 28, 0, 29, 0, 30, 0, 31, 0));
+
+       /* initialize the cached sc2vl values consistently with h/w */
+       for (i = 0; i < 32; i++) {
+               if (i < 8 || i == 15)
+                       *((u8 *)(dd->sc2vl) + i) = (u8)i;
+               else
+                       *((u8 *)(dd->sc2vl) + i) = 0;
+       }
+}
+
+/*
+ * Read chip sizes and then reset parts to sane, disabled, values.  We cannot
+ * depend on the chip going through a power-on reset - a driver may be loaded
+ * and unloaded many times.
+ *
+ * Do not write any CSR values to the chip in this routine - there may be
+ * a reset following the (possible) FLR in this routine.
+ *
+ */
+static void init_chip(struct hfi1_devdata *dd)
+{
+       int i;
+
+       /*
+        * Put the HFI CSRs in a known state.
+        * Combine this with a DC reset.
+        *
+        * Stop the device from doing anything while we do a
+        * reset.  We know there are no other active users of
+        * the device since we are now in charge.  Turn off
+        * off all outbound and inbound traffic and make sure
+        * the device does not generate any interrupts.
+        */
+
+       /* disable send contexts and SDMA engines */
+       write_csr(dd, SEND_CTRL, 0);
+       for (i = 0; i < dd->chip_send_contexts; i++)
+               write_kctxt_csr(dd, i, SEND_CTXT_CTRL, 0);
+       for (i = 0; i < dd->chip_sdma_engines; i++)
+               write_kctxt_csr(dd, i, SEND_DMA_CTRL, 0);
+       /* disable port (turn off RXE inbound traffic) and contexts */
+       write_csr(dd, RCV_CTRL, 0);
+       for (i = 0; i < dd->chip_rcv_contexts; i++)
+               write_csr(dd, RCV_CTXT_CTRL, 0);
+       /* mask all interrupt sources */
+       for (i = 0; i < CCE_NUM_INT_CSRS; i++)
+               write_csr(dd, CCE_INT_MASK + (8*i), 0ull);
+
+       /*
+        * DC Reset: do a full DC reset before the register clear.
+        * A recommended length of time to hold is one CSR read,
+        * so reread the CceDcCtrl.  Then, hold the DC in reset
+        * across the clear.
+        */
+       write_csr(dd, CCE_DC_CTRL, CCE_DC_CTRL_DC_RESET_SMASK);
+       (void) read_csr(dd, CCE_DC_CTRL);
+
+       if (use_flr) {
+               /*
+                * A FLR will reset the SPC core and part of the PCIe.
+                * The parts that need to be restored have already been
+                * saved.
+                */
+               dd_dev_info(dd, "Resetting CSRs with FLR\n");
+
+               /* do the FLR, the DC reset will remain */
+               hfi1_pcie_flr(dd);
+
+               /* restore command and BARs */
+               restore_pci_variables(dd);
+
+               if (is_a0(dd)) {
+                       dd_dev_info(dd, "Resetting CSRs with FLR\n");
+                       hfi1_pcie_flr(dd);
+                       restore_pci_variables(dd);
+               }
+
+               reset_asic_csrs(dd);
+       } else {
+               dd_dev_info(dd, "Resetting CSRs with writes\n");
+               reset_cce_csrs(dd);
+               reset_txe_csrs(dd);
+               reset_rxe_csrs(dd);
+               reset_asic_csrs(dd);
+               reset_misc_csrs(dd);
+       }
+       /* clear the DC reset */
+       write_csr(dd, CCE_DC_CTRL, 0);
+
+       /* Set the LED off */
+       if (is_a0(dd))
+               setextled(dd, 0);
+       /*
+        * Clear the QSFP reset.
+        * A0 leaves the out lines floating on power on, then on an FLR
+        * enforces a 0 on all out pins.  The driver does not touch
+        * ASIC_QSFPn_OUT otherwise.  This leaves RESET_N low and
+        * anything  plugged constantly in reset, if it pays attention
+        * to RESET_N.
+        * A prime example of this is SiPh. For now, set all pins high.
+        * I2CCLK and I2CDAT will change per direction, and INT_N and
+        * MODPRS_N are input only and their value is ignored.
+        */
+       if (is_a0(dd)) {
+               write_csr(dd, ASIC_QSFP1_OUT, 0x1f);
+               write_csr(dd, ASIC_QSFP2_OUT, 0x1f);
+       }
+}
+
+static void init_early_variables(struct hfi1_devdata *dd)
+{
+       int i;
+
+       /* assign link credit variables */
+       dd->vau = CM_VAU;
+       dd->link_credits = CM_GLOBAL_CREDITS;
+       if (is_a0(dd))
+               dd->link_credits--;
+       dd->vcu = cu_to_vcu(hfi1_cu);
+       /* enough room for 8 MAD packets plus header - 17K */
+       dd->vl15_init = (8 * (2048 + 128)) / vau_to_au(dd->vau);
+       if (dd->vl15_init > dd->link_credits)
+               dd->vl15_init = dd->link_credits;
+
+       write_uninitialized_csrs_and_memories(dd);
+
+       if (HFI1_CAP_IS_KSET(PKEY_CHECK))
+               for (i = 0; i < dd->num_pports; i++) {
+                       struct hfi1_pportdata *ppd = &dd->pport[i];
+
+                       set_partition_keys(ppd);
+               }
+       init_sc2vl_tables(dd);
+}
+
+static void init_kdeth_qp(struct hfi1_devdata *dd)
+{
+       /* user changed the KDETH_QP */
+       if (kdeth_qp != 0 && kdeth_qp >= 0xff) {
+               /* out of range or illegal value */
+               dd_dev_err(dd, "Invalid KDETH queue pair prefix, ignoring");
+               kdeth_qp = 0;
+       }
+       if (kdeth_qp == 0)      /* not set, or failed range check */
+               kdeth_qp = DEFAULT_KDETH_QP;
+
+       write_csr(dd, SEND_BTH_QP,
+                       (kdeth_qp & SEND_BTH_QP_KDETH_QP_MASK)
+                               << SEND_BTH_QP_KDETH_QP_SHIFT);
+
+       write_csr(dd, RCV_BTH_QP,
+                       (kdeth_qp & RCV_BTH_QP_KDETH_QP_MASK)
+                               << RCV_BTH_QP_KDETH_QP_SHIFT);
+}
+
+/**
+ * init_qpmap_table
+ * @dd - device data
+ * @first_ctxt - first context
+ * @last_ctxt - first context
+ *
+ * This return sets the qpn mapping table that
+ * is indexed by qpn[8:1].
+ *
+ * The routine will round robin the 256 settings
+ * from first_ctxt to last_ctxt.
+ *
+ * The first/last looks ahead to having specialized
+ * receive contexts for mgmt and bypass.  Normal
+ * verbs traffic will assumed to be on a range
+ * of receive contexts.
+ */
+static void init_qpmap_table(struct hfi1_devdata *dd,
+                            u32 first_ctxt,
+                            u32 last_ctxt)
+{
+       u64 reg = 0;
+       u64 regno = RCV_QP_MAP_TABLE;
+       int i;
+       u64 ctxt = first_ctxt;
+
+       for (i = 0; i < 256;) {
+               if (ctxt == VL15CTXT) {
+                       ctxt++;
+                       if (ctxt > last_ctxt)
+                               ctxt = first_ctxt;
+                       continue;
+               }
+               reg |= ctxt << (8 * (i % 8));
+               i++;
+               ctxt++;
+               if (ctxt > last_ctxt)
+                       ctxt = first_ctxt;
+               if (i % 8 == 0) {
+                       write_csr(dd, regno, reg);
+                       reg = 0;
+                       regno += 8;
+               }
+       }
+       if (i % 8)
+               write_csr(dd, regno, reg);
+
+       add_rcvctrl(dd, RCV_CTRL_RCV_QP_MAP_ENABLE_SMASK
+                       | RCV_CTRL_RCV_BYPASS_ENABLE_SMASK);
+}
+
+/**
+ * init_qos - init RX qos
+ * @dd - device data
+ * @first_context
+ *
+ * This routine initializes Rule 0 and the
+ * RSM map table to implement qos.
+ *
+ * If all of the limit tests succeed,
+ * qos is applied based on the array
+ * interpretation of krcvqs where
+ * entry 0 is VL0.
+ *
+ * The number of vl bits (n) and the number of qpn
+ * bits (m) are computed to feed both the RSM map table
+ * and the single rule.
+ *
+ */
+static void init_qos(struct hfi1_devdata *dd, u32 first_ctxt)
+{
+       u8 max_by_vl = 0;
+       unsigned qpns_per_vl, ctxt, i, qpn, n = 1, m;
+       u64 *rsmmap;
+       u64 reg;
+       u8  rxcontext = is_a0(dd) ? 0 : 0xff;  /* 0 is default if a0 ver. */
+
+       /* validate */
+       if (dd->n_krcv_queues <= MIN_KERNEL_KCTXTS ||
+           num_vls == 1 ||
+           krcvqsset <= 1)
+               goto bail;
+       for (i = 0; i < min_t(unsigned, num_vls, krcvqsset); i++)
+               if (krcvqs[i] > max_by_vl)
+                       max_by_vl = krcvqs[i];
+       if (max_by_vl > 32)
+               goto bail;
+       qpns_per_vl = __roundup_pow_of_two(max_by_vl);
+       /* determine bits vl */
+       n = ilog2(num_vls);
+       /* determine bits for qpn */
+       m = ilog2(qpns_per_vl);
+       if ((m + n) > 7)
+               goto bail;
+       if (num_vls * qpns_per_vl > dd->chip_rcv_contexts)
+               goto bail;
+       rsmmap = kmalloc_array(NUM_MAP_REGS, sizeof(u64), GFP_KERNEL);
+       memset(rsmmap, rxcontext, NUM_MAP_REGS * sizeof(u64));
+       /* init the local copy of the table */
+       for (i = 0, ctxt = first_ctxt; i < num_vls; i++) {
+               unsigned tctxt;
+
+               for (qpn = 0, tctxt = ctxt;
+                    krcvqs[i] && qpn < qpns_per_vl; qpn++) {
+                       unsigned idx, regoff, regidx;
+
+                       /* generate index <= 128 */
+                       idx = (qpn << n) ^ i;
+                       regoff = (idx % 8) * 8;
+                       regidx = idx / 8;
+                       reg = rsmmap[regidx];
+                       /* replace 0xff with context number */
+                       reg &= ~(RCV_RSM_MAP_TABLE_RCV_CONTEXT_A_MASK
+                               << regoff);
+                       reg |= (u64)(tctxt++) << regoff;
+                       rsmmap[regidx] = reg;
+                       if (tctxt == ctxt + krcvqs[i])
+                               tctxt = ctxt;
+               }
+               ctxt += krcvqs[i];
+       }
+       /* flush cached copies to chip */
+       for (i = 0; i < NUM_MAP_REGS; i++)
+               write_csr(dd, RCV_RSM_MAP_TABLE + (8 * i), rsmmap[i]);
+       /* add rule0 */
+       write_csr(dd, RCV_RSM_CFG /* + (8 * 0) */,
+               RCV_RSM_CFG_ENABLE_OR_CHAIN_RSM0_MASK
+                       << RCV_RSM_CFG_ENABLE_OR_CHAIN_RSM0_SHIFT |
+               2ull << RCV_RSM_CFG_PACKET_TYPE_SHIFT);
+       write_csr(dd, RCV_RSM_SELECT /* + (8 * 0) */,
+               LRH_BTH_MATCH_OFFSET
+                       << RCV_RSM_SELECT_FIELD1_OFFSET_SHIFT |
+               LRH_SC_MATCH_OFFSET << RCV_RSM_SELECT_FIELD2_OFFSET_SHIFT |
+               LRH_SC_SELECT_OFFSET << RCV_RSM_SELECT_INDEX1_OFFSET_SHIFT |
+               ((u64)n) << RCV_RSM_SELECT_INDEX1_WIDTH_SHIFT |
+               QPN_SELECT_OFFSET << RCV_RSM_SELECT_INDEX2_OFFSET_SHIFT |
+               ((u64)m + (u64)n) << RCV_RSM_SELECT_INDEX2_WIDTH_SHIFT);
+       write_csr(dd, RCV_RSM_MATCH /* + (8 * 0) */,
+               LRH_BTH_MASK << RCV_RSM_MATCH_MASK1_SHIFT |
+               LRH_BTH_VALUE << RCV_RSM_MATCH_VALUE1_SHIFT |
+               LRH_SC_MASK << RCV_RSM_MATCH_MASK2_SHIFT |
+               LRH_SC_VALUE << RCV_RSM_MATCH_VALUE2_SHIFT);
+       /* Enable RSM */
+       add_rcvctrl(dd, RCV_CTRL_RCV_RSM_ENABLE_SMASK);
+       kfree(rsmmap);
+       /* map everything else (non-VL15) to context 0 */
+       init_qpmap_table(
+               dd,
+               0,
+               0);
+       dd->qos_shift = n + 1;
+       return;
+bail:
+       dd->qos_shift = 1;
+       init_qpmap_table(
+               dd,
+               dd->n_krcv_queues > MIN_KERNEL_KCTXTS ? MIN_KERNEL_KCTXTS : 0,
+               dd->n_krcv_queues - 1);
+}
+
+static void init_rxe(struct hfi1_devdata *dd)
+{
+       /* enable all receive errors */
+       write_csr(dd, RCV_ERR_MASK, ~0ull);
+       /* setup QPN map table - start where VL15 context leaves off */
+       init_qos(
+               dd,
+               dd->n_krcv_queues > MIN_KERNEL_KCTXTS ? MIN_KERNEL_KCTXTS : 0);
+       /*
+        * make sure RcvCtrl.RcvWcb <= PCIe Device Control
+        * Register Max_Payload_Size (PCI_EXP_DEVCTL in Linux PCIe config
+        * space, PciCfgCap2.MaxPayloadSize in HFI).  There is only one
+        * invalid configuration: RcvCtrl.RcvWcb set to its max of 256 and
+        * Max_PayLoad_Size set to its minimum of 128.
+        *
+        * Presently, RcvCtrl.RcvWcb is not modified from its default of 0
+        * (64 bytes).  Max_Payload_Size is possibly modified upward in
+        * tune_pcie_caps() which is called after this routine.
+        */
+}
+
+static void init_other(struct hfi1_devdata *dd)
+{
+       /* enable all CCE errors */
+       write_csr(dd, CCE_ERR_MASK, ~0ull);
+       /* enable *some* Misc errors */
+       write_csr(dd, MISC_ERR_MASK, DRIVER_MISC_MASK);
+       /* enable all DC errors, except LCB */
+       write_csr(dd, DCC_ERR_FLG_EN, ~0ull);
+       write_csr(dd, DC_DC8051_ERR_EN, ~0ull);
+}
+
+/*
+ * Fill out the given AU table using the given CU.  A CU is defined in terms
+ * AUs.  The table is a an encoding: given the index, how many AUs does that
+ * represent?
+ *
+ * NOTE: Assumes that the register layout is the same for the
+ * local and remote tables.
+ */
+static void assign_cm_au_table(struct hfi1_devdata *dd, u32 cu,
+                              u32 csr0to3, u32 csr4to7)
+{
+       write_csr(dd, csr0to3,
+                  0ull <<
+                       SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE0_SHIFT
+               |  1ull <<
+                       SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE1_SHIFT
+               |  2ull * cu <<
+                       SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE2_SHIFT
+               |  4ull * cu <<
+                       SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE3_SHIFT);
+       write_csr(dd, csr4to7,
+                  8ull * cu <<
+                       SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE4_SHIFT
+               | 16ull * cu <<
+                       SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE5_SHIFT
+               | 32ull * cu <<
+                       SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE6_SHIFT
+               | 64ull * cu <<
+                       SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE7_SHIFT);
+
+}
+
+static void assign_local_cm_au_table(struct hfi1_devdata *dd, u8 vcu)
+{
+       assign_cm_au_table(dd, vcu_to_cu(vcu), SEND_CM_LOCAL_AU_TABLE0_TO3,
+                                       SEND_CM_LOCAL_AU_TABLE4_TO7);
+}
+
+void assign_remote_cm_au_table(struct hfi1_devdata *dd, u8 vcu)
+{
+       assign_cm_au_table(dd, vcu_to_cu(vcu), SEND_CM_REMOTE_AU_TABLE0_TO3,
+                                       SEND_CM_REMOTE_AU_TABLE4_TO7);
+}
+
+static void init_txe(struct hfi1_devdata *dd)
+{
+       int i;
+
+       /* enable all PIO, SDMA, general, and Egress errors */
+       write_csr(dd, SEND_PIO_ERR_MASK, ~0ull);
+       write_csr(dd, SEND_DMA_ERR_MASK, ~0ull);
+       write_csr(dd, SEND_ERR_MASK, ~0ull);
+       write_csr(dd, SEND_EGRESS_ERR_MASK, ~0ull);
+
+       /* enable all per-context and per-SDMA engine errors */
+       for (i = 0; i < dd->chip_send_contexts; i++)
+               write_kctxt_csr(dd, i, SEND_CTXT_ERR_MASK, ~0ull);
+       for (i = 0; i < dd->chip_sdma_engines; i++)
+               write_kctxt_csr(dd, i, SEND_DMA_ENG_ERR_MASK, ~0ull);
+
+       /* set the local CU to AU mapping */
+       assign_local_cm_au_table(dd, dd->vcu);
+
+       /*
+        * Set reasonable default for Credit Return Timer
+        * Don't set on Simulator - causes it to choke.
+        */
+       if (dd->icode != ICODE_FUNCTIONAL_SIMULATOR)
+               write_csr(dd, SEND_CM_TIMER_CTRL, HFI1_CREDIT_RETURN_RATE);
+}
+
+int hfi1_set_ctxt_jkey(struct hfi1_devdata *dd, unsigned ctxt, u16 jkey)
+{
+       struct hfi1_ctxtdata *rcd = dd->rcd[ctxt];
+       unsigned sctxt;
+       int ret = 0;
+       u64 reg;
+
+       if (!rcd || !rcd->sc) {
+               ret = -EINVAL;
+               goto done;
+       }
+       sctxt = rcd->sc->hw_context;
+       reg = SEND_CTXT_CHECK_JOB_KEY_MASK_SMASK | /* mask is always 1's */
+               ((jkey & SEND_CTXT_CHECK_JOB_KEY_VALUE_MASK) <<
+                SEND_CTXT_CHECK_JOB_KEY_VALUE_SHIFT);
+       /* JOB_KEY_ALLOW_PERMISSIVE is not allowed by default */
+       if (HFI1_CAP_KGET_MASK(rcd->flags, ALLOW_PERM_JKEY))
+               reg |= SEND_CTXT_CHECK_JOB_KEY_ALLOW_PERMISSIVE_SMASK;
+       write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_JOB_KEY, reg);
+       /*
+        * Enable send-side J_KEY integrity check, unless this is A0 h/w
+        * (due to A0 erratum).
+        */
+       if (!is_a0(dd)) {
+               reg = read_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE);
+               reg |= SEND_CTXT_CHECK_ENABLE_CHECK_JOB_KEY_SMASK;
+               write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE, reg);
+       }
+
+       /* Enable J_KEY check on receive context. */
+       reg = RCV_KEY_CTRL_JOB_KEY_ENABLE_SMASK |
+               ((jkey & RCV_KEY_CTRL_JOB_KEY_VALUE_MASK) <<
+                RCV_KEY_CTRL_JOB_KEY_VALUE_SHIFT);
+       write_kctxt_csr(dd, ctxt, RCV_KEY_CTRL, reg);
+done:
+       return ret;
+}
+
+int hfi1_clear_ctxt_jkey(struct hfi1_devdata *dd, unsigned ctxt)
+{
+       struct hfi1_ctxtdata *rcd = dd->rcd[ctxt];
+       unsigned sctxt;
+       int ret = 0;
+       u64 reg;
+
+       if (!rcd || !rcd->sc) {
+               ret = -EINVAL;
+               goto done;
+       }
+       sctxt = rcd->sc->hw_context;
+       write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_JOB_KEY, 0);
+       /*
+        * Disable send-side J_KEY integrity check, unless this is A0 h/w.
+        * This check would not have been enabled for A0 h/w, see
+        * set_ctxt_jkey().
+        */
+       if (!is_a0(dd)) {
+               reg = read_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE);
+               reg &= ~SEND_CTXT_CHECK_ENABLE_CHECK_JOB_KEY_SMASK;
+               write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE, reg);
+       }
+       /* Turn off the J_KEY on the receive side */
+       write_kctxt_csr(dd, ctxt, RCV_KEY_CTRL, 0);
+done:
+       return ret;
+}
+
+int hfi1_set_ctxt_pkey(struct hfi1_devdata *dd, unsigned ctxt, u16 pkey)
+{
+       struct hfi1_ctxtdata *rcd;
+       unsigned sctxt;
+       int ret = 0;
+       u64 reg;
+
+       if (ctxt < dd->num_rcv_contexts)
+               rcd = dd->rcd[ctxt];
+       else {
+               ret = -EINVAL;
+               goto done;
+       }
+       if (!rcd || !rcd->sc) {
+               ret = -EINVAL;
+               goto done;
+       }
+       sctxt = rcd->sc->hw_context;
+       reg = ((u64)pkey & SEND_CTXT_CHECK_PARTITION_KEY_VALUE_MASK) <<
+               SEND_CTXT_CHECK_PARTITION_KEY_VALUE_SHIFT;
+       write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_PARTITION_KEY, reg);
+       reg = read_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE);
+       reg |= SEND_CTXT_CHECK_ENABLE_CHECK_PARTITION_KEY_SMASK;
+       write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE, reg);
+done:
+       return ret;
+}
+
+int hfi1_clear_ctxt_pkey(struct hfi1_devdata *dd, unsigned ctxt)
+{
+       struct hfi1_ctxtdata *rcd;
+       unsigned sctxt;
+       int ret = 0;
+       u64 reg;
+
+       if (ctxt < dd->num_rcv_contexts)
+               rcd = dd->rcd[ctxt];
+       else {
+               ret = -EINVAL;
+               goto done;
+       }
+       if (!rcd || !rcd->sc) {
+               ret = -EINVAL;
+               goto done;
+       }
+       sctxt = rcd->sc->hw_context;
+       reg = read_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE);
+       reg &= ~SEND_CTXT_CHECK_ENABLE_CHECK_PARTITION_KEY_SMASK;
+       write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE, reg);
+       write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_PARTITION_KEY, 0);
+done:
+       return ret;
+}
+
+/*
+ * Start doing the clean up the the chip. Our clean up happens in multiple
+ * stages and this is just the first.
+ */
+void hfi1_start_cleanup(struct hfi1_devdata *dd)
+{
+       free_cntrs(dd);
+       free_rcverr(dd);
+       clean_up_interrupts(dd);
+}
+
+#define HFI_BASE_GUID(dev) \
+       ((dev)->base_guid & ~(1ULL << GUID_HFI_INDEX_SHIFT))
+
+/*
+ * Certain chip functions need to be initialized only once per asic
+ * instead of per-device. This function finds the peer device and
+ * checks whether that chip initialization needs to be done by this
+ * device.
+ */
+static void asic_should_init(struct hfi1_devdata *dd)
+{
+       unsigned long flags;
+       struct hfi1_devdata *tmp, *peer = NULL;
+
+       spin_lock_irqsave(&hfi1_devs_lock, flags);
+       /* Find our peer device */
+       list_for_each_entry(tmp, &hfi1_dev_list, list) {
+               if ((HFI_BASE_GUID(dd) == HFI_BASE_GUID(tmp)) &&
+                   dd->unit != tmp->unit) {
+                       peer = tmp;
+                       break;
+               }
+       }
+
+       /*
+        * "Claim" the ASIC for initialization if it hasn't been
+        " "claimed" yet.
+        */
+       if (!peer || !(peer->flags & HFI1_DO_INIT_ASIC))
+               dd->flags |= HFI1_DO_INIT_ASIC;
+       spin_unlock_irqrestore(&hfi1_devs_lock, flags);
+}
+
+/**
+ * Allocate and initialize the device structure for the hfi.
+ * @dev: the pci_dev for hfi1_ib device
+ * @ent: pci_device_id struct for this dev
+ *
+ * Also allocates, initializes, and returns the devdata struct for this
+ * device instance
+ *
+ * This is global, and is called directly at init to set up the
+ * chip-specific function pointers for later use.
+ */
+struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev,
+                                 const struct pci_device_id *ent)
+{
+       struct hfi1_devdata *dd;
+       struct hfi1_pportdata *ppd;
+       u64 reg;
+       int i, ret;
+       static const char * const inames[] = { /* implementation names */
+               "RTL silicon",
+               "RTL VCS simulation",
+               "RTL FPGA emulation",
+               "Functional simulator"
+       };
+
+       dd = hfi1_alloc_devdata(pdev,
+               NUM_IB_PORTS * sizeof(struct hfi1_pportdata));
+       if (IS_ERR(dd))
+               goto bail;
+       ppd = dd->pport;
+       for (i = 0; i < dd->num_pports; i++, ppd++) {
+               int vl;
+               /* init common fields */
+               hfi1_init_pportdata(pdev, ppd, dd, 0, 1);
+               /* DC supports 4 link widths */
+               ppd->link_width_supported =
+                       OPA_LINK_WIDTH_1X | OPA_LINK_WIDTH_2X |
+                       OPA_LINK_WIDTH_3X | OPA_LINK_WIDTH_4X;
+               ppd->link_width_downgrade_supported =
+                       ppd->link_width_supported;
+               /* start out enabling only 4X */
+               ppd->link_width_enabled = OPA_LINK_WIDTH_4X;
+               ppd->link_width_downgrade_enabled =
+                                       ppd->link_width_downgrade_supported;
+               /* link width active is 0 when link is down */
+               /* link width downgrade active is 0 when link is down */
+
+               if (num_vls < HFI1_MIN_VLS_SUPPORTED
+                       || num_vls > HFI1_MAX_VLS_SUPPORTED) {
+                       hfi1_early_err(&pdev->dev,
+                                      "Invalid num_vls %u, using %u VLs\n",
+                                   num_vls, HFI1_MAX_VLS_SUPPORTED);
+                       num_vls = HFI1_MAX_VLS_SUPPORTED;
+               }
+               ppd->vls_supported = num_vls;
+               ppd->vls_operational = ppd->vls_supported;
+               /* Set the default MTU. */
+               for (vl = 0; vl < num_vls; vl++)
+                       dd->vld[vl].mtu = hfi1_max_mtu;
+               dd->vld[15].mtu = MAX_MAD_PACKET;
+               /*
+                * Set the initial values to reasonable default, will be set
+                * for real when link is up.
+                */
+               ppd->lstate = IB_PORT_DOWN;
+               ppd->overrun_threshold = 0x4;
+               ppd->phy_error_threshold = 0xf;
+               ppd->port_crc_mode_enabled = link_crc_mask;
+               /* initialize supported LTP CRC mode */
+               ppd->port_ltp_crc_mode = cap_to_port_ltp(link_crc_mask) << 8;
+               /* initialize enabled LTP CRC mode */
+               ppd->port_ltp_crc_mode |= cap_to_port_ltp(link_crc_mask) << 4;
+               /* start in offline */
+               ppd->host_link_state = HLS_DN_OFFLINE;
+               init_vl_arb_caches(ppd);
+       }
+
+       dd->link_default = HLS_DN_POLL;
+
+       /*
+        * Do remaining PCIe setup and save PCIe values in dd.
+        * Any error printing is already done by the init code.
+        * On return, we have the chip mapped.
+        */
+       ret = hfi1_pcie_ddinit(dd, pdev, ent);
+       if (ret < 0)
+               goto bail_free;
+
+       /* verify that reads actually work, save revision for reset check */
+       dd->revision = read_csr(dd, CCE_REVISION);
+       if (dd->revision == ~(u64)0) {
+               dd_dev_err(dd, "cannot read chip CSRs\n");
+               ret = -EINVAL;
+               goto bail_cleanup;
+       }
+       dd->majrev = (dd->revision >> CCE_REVISION_CHIP_REV_MAJOR_SHIFT)
+                       & CCE_REVISION_CHIP_REV_MAJOR_MASK;
+       dd->minrev = (dd->revision >> CCE_REVISION_CHIP_REV_MINOR_SHIFT)
+                       & CCE_REVISION_CHIP_REV_MINOR_MASK;
+
+       /* obtain the hardware ID - NOT related to unit, which is a
+          software enumeration */
+       reg = read_csr(dd, CCE_REVISION2);
+       dd->hfi1_id = (reg >> CCE_REVISION2_HFI_ID_SHIFT)
+                                       & CCE_REVISION2_HFI_ID_MASK;
+       /* the variable size will remove unwanted bits */
+       dd->icode = reg >> CCE_REVISION2_IMPL_CODE_SHIFT;
+       dd->irev = reg >> CCE_REVISION2_IMPL_REVISION_SHIFT;
+       dd_dev_info(dd, "Implementation: %s, revision 0x%x\n",
+               dd->icode < ARRAY_SIZE(inames) ? inames[dd->icode] : "unknown",
+               (int)dd->irev);
+
+       /* speeds the hardware can support */
+       dd->pport->link_speed_supported = OPA_LINK_SPEED_25G;
+       /* speeds allowed to run at */
+       dd->pport->link_speed_enabled = dd->pport->link_speed_supported;
+       /* give a reasonable active value, will be set on link up */
+       dd->pport->link_speed_active = OPA_LINK_SPEED_25G;
+
+       dd->chip_rcv_contexts = read_csr(dd, RCV_CONTEXTS);
+       dd->chip_send_contexts = read_csr(dd, SEND_CONTEXTS);
+       dd->chip_sdma_engines = read_csr(dd, SEND_DMA_ENGINES);
+       dd->chip_pio_mem_size = read_csr(dd, SEND_PIO_MEM_SIZE);
+       dd->chip_sdma_mem_size = read_csr(dd, SEND_DMA_MEM_SIZE);
+       /* fix up link widths for emulation _p */
+       ppd = dd->pport;
+       if (dd->icode == ICODE_FPGA_EMULATION && is_emulator_p(dd)) {
+               ppd->link_width_supported =
+                       ppd->link_width_enabled =
+                       ppd->link_width_downgrade_supported =
+                       ppd->link_width_downgrade_enabled =
+                               OPA_LINK_WIDTH_1X;
+       }
+       /* insure num_vls isn't larger than number of sdma engines */
+       if (HFI1_CAP_IS_KSET(SDMA) && num_vls > dd->chip_sdma_engines) {
+               dd_dev_err(dd, "num_vls %u too large, using %u VLs\n",
+                               num_vls, HFI1_MAX_VLS_SUPPORTED);
+               ppd->vls_supported = num_vls = HFI1_MAX_VLS_SUPPORTED;
+               ppd->vls_operational = ppd->vls_supported;
+       }
+
+       /*
+        * Convert the ns parameter to the 64 * cclocks used in the CSR.
+        * Limit the max if larger than the field holds.  If timeout is
+        * non-zero, then the calculated field will be at least 1.
+        *
+        * Must be after icode is set up - the cclock rate depends
+        * on knowing the hardware being used.
+        */
+       dd->rcv_intr_timeout_csr = ns_to_cclock(dd, rcv_intr_timeout) / 64;
+       if (dd->rcv_intr_timeout_csr >
+                       RCV_AVAIL_TIME_OUT_TIME_OUT_RELOAD_MASK)
+               dd->rcv_intr_timeout_csr =
+                       RCV_AVAIL_TIME_OUT_TIME_OUT_RELOAD_MASK;
+       else if (dd->rcv_intr_timeout_csr == 0 && rcv_intr_timeout)
+               dd->rcv_intr_timeout_csr = 1;
+
+       /* needs to be done before we look for the peer device */
+       read_guid(dd);
+
+       /* should this device init the ASIC block? */
+       asic_should_init(dd);
+
+       /* obtain chip sizes, reset chip CSRs */
+       init_chip(dd);
+
+       /* read in the PCIe link speed information */
+       ret = pcie_speeds(dd);
+       if (ret)
+               goto bail_cleanup;
+
+       /* read in firmware */
+       ret = hfi1_firmware_init(dd);
+       if (ret)
+               goto bail_cleanup;
+
+       /*
+        * In general, the PCIe Gen3 transition must occur after the
+        * chip has been idled (so it won't initiate any PCIe transactions
+        * e.g. an interrupt) and before the driver changes any registers
+        * (the transition will reset the registers).
+        *
+        * In particular, place this call after:
+        * - init_chip()     - the chip will not initiate any PCIe transactions
+        * - pcie_speeds()   - reads the current link speed
+        * - hfi1_firmware_init() - the needed firmware is ready to be
+        *                          downloaded
+        */
+       ret = do_pcie_gen3_transition(dd);
+       if (ret)
+               goto bail_cleanup;
+
+       /* start setting dd values and adjusting CSRs */
+       init_early_variables(dd);
+
+       parse_platform_config(dd);
+
+       /* add board names as they are defined */
+       dd->boardname = kmalloc(64, GFP_KERNEL);
+       if (!dd->boardname)
+               goto bail_cleanup;
+       snprintf(dd->boardname, 64, "Board ID 0x%llx",
+                dd->revision >> CCE_REVISION_BOARD_ID_LOWER_NIBBLE_SHIFT
+                   & CCE_REVISION_BOARD_ID_LOWER_NIBBLE_MASK);
+
+       snprintf(dd->boardversion, BOARD_VERS_MAX,
+                "ChipABI %u.%u, %s, ChipRev %u.%u, SW Compat %llu\n",
+                HFI1_CHIP_VERS_MAJ, HFI1_CHIP_VERS_MIN,
+                dd->boardname,
+                (u32)dd->majrev,
+                (u32)dd->minrev,
+                (dd->revision >> CCE_REVISION_SW_SHIFT)
+                   & CCE_REVISION_SW_MASK);
+
+       ret = set_up_context_variables(dd);
+       if (ret)
+               goto bail_cleanup;
+
+       /* set initial RXE CSRs */
+       init_rxe(dd);
+       /* set initial TXE CSRs */
+       init_txe(dd);
+       /* set initial non-RXE, non-TXE CSRs */
+       init_other(dd);
+       /* set up KDETH QP prefix in both RX and TX CSRs */
+       init_kdeth_qp(dd);
+
+       /* send contexts must be set up before receive contexts */
+       ret = init_send_contexts(dd);
+       if (ret)
+               goto bail_cleanup;
+
+       ret = hfi1_create_ctxts(dd);
+       if (ret)
+               goto bail_cleanup;
+
+       dd->rcvhdrsize = DEFAULT_RCVHDRSIZE;
+       /*
+        * rcd[0] is guaranteed to be valid by this point. Also, all
+        * context are using the same value, as per the module parameter.
+        */
+       dd->rhf_offset = dd->rcd[0]->rcvhdrqentsize - sizeof(u64) / sizeof(u32);
+
+       ret = init_pervl_scs(dd);
+       if (ret)
+               goto bail_cleanup;
+
+       /* sdma init */
+       for (i = 0; i < dd->num_pports; ++i) {
+               ret = sdma_init(dd, i);
+               if (ret)
+                       goto bail_cleanup;
+       }
+
+       /* use contexts created by hfi1_create_ctxts */
+       ret = set_up_interrupts(dd);
+       if (ret)
+               goto bail_cleanup;
+
+       /* set up LCB access - must be after set_up_interrupts() */
+       init_lcb_access(dd);
+
+       snprintf(dd->serial, SERIAL_MAX, "0x%08llx\n",
+                dd->base_guid & 0xFFFFFF);
+
+       dd->oui1 = dd->base_guid >> 56 & 0xFF;
+       dd->oui2 = dd->base_guid >> 48 & 0xFF;
+       dd->oui3 = dd->base_guid >> 40 & 0xFF;
+
+       ret = load_firmware(dd); /* asymmetric with dispose_firmware() */
+       if (ret)
+               goto bail_clear_intr;
+       check_fabric_firmware_versions(dd);
+
+       thermal_init(dd);
+
+       ret = init_cntrs(dd);
+       if (ret)
+               goto bail_clear_intr;
+
+       ret = init_rcverr(dd);
+       if (ret)
+               goto bail_free_cntrs;
+
+       ret = eprom_init(dd);
+       if (ret)
+               goto bail_free_rcverr;
+
+       goto bail;
+
+bail_free_rcverr:
+       free_rcverr(dd);
+bail_free_cntrs:
+       free_cntrs(dd);
+bail_clear_intr:
+       clean_up_interrupts(dd);
+bail_cleanup:
+       hfi1_pcie_ddcleanup(dd);
+bail_free:
+       hfi1_free_devdata(dd);
+       dd = ERR_PTR(ret);
+bail:
+       return dd;
+}
+
+static u16 delay_cycles(struct hfi1_pportdata *ppd, u32 desired_egress_rate,
+                       u32 dw_len)
+{
+       u32 delta_cycles;
+       u32 current_egress_rate = ppd->current_egress_rate;
+       /* rates here are in units of 10^6 bits/sec */
+
+       if (desired_egress_rate == -1)
+               return 0; /* shouldn't happen */
+
+       if (desired_egress_rate >= current_egress_rate)
+               return 0; /* we can't help go faster, only slower */
+
+       delta_cycles = egress_cycles(dw_len * 4, desired_egress_rate) -
+                       egress_cycles(dw_len * 4, current_egress_rate);
+
+       return (u16)delta_cycles;
+}
+
+
+/**
+ * create_pbc - build a pbc for transmission
+ * @flags: special case flags or-ed in built pbc
+ * @srate: static rate
+ * @vl: vl
+ * @dwlen: dword length (header words + data words + pbc words)
+ *
+ * Create a PBC with the given flags, rate, VL, and length.
+ *
+ * NOTE: The PBC created will not insert any HCRC - all callers but one are
+ * for verbs, which does not use this PSM feature.  The lone other caller
+ * is for the diagnostic interface which calls this if the user does not
+ * supply their own PBC.
+ */
+u64 create_pbc(struct hfi1_pportdata *ppd, u64 flags, int srate_mbs, u32 vl,
+              u32 dw_len)
+{
+       u64 pbc, delay = 0;
+
+       if (unlikely(srate_mbs))
+               delay = delay_cycles(ppd, srate_mbs, dw_len);
+
+       pbc = flags
+               | (delay << PBC_STATIC_RATE_CONTROL_COUNT_SHIFT)
+               | ((u64)PBC_IHCRC_NONE << PBC_INSERT_HCRC_SHIFT)
+               | (vl & PBC_VL_MASK) << PBC_VL_SHIFT
+               | (dw_len & PBC_LENGTH_DWS_MASK)
+                       << PBC_LENGTH_DWS_SHIFT;
+
+       return pbc;
+}
+
+#define SBUS_THERMAL    0x4f
+#define SBUS_THERM_MONITOR_MODE 0x1
+
+#define THERM_FAILURE(dev, ret, reason) \
+       dd_dev_err((dd),                                                \
+                  "Thermal sensor initialization failed: %s (%d)\n",   \
+                  (reason), (ret))
+
+/*
+ * Initialize the Avago Thermal sensor.
+ *
+ * After initialization, enable polling of thermal sensor through
+ * SBus interface. In order for this to work, the SBus Master
+ * firmware has to be loaded due to the fact that the HW polling
+ * logic uses SBus interrupts, which are not supported with
+ * default firmware. Otherwise, no data will be returned through
+ * the ASIC_STS_THERM CSR.
+ */
+static int thermal_init(struct hfi1_devdata *dd)
+{
+       int ret = 0;
+
+       if (dd->icode != ICODE_RTL_SILICON ||
+           !(dd->flags & HFI1_DO_INIT_ASIC))
+               return ret;
+
+       acquire_hw_mutex(dd);
+       dd_dev_info(dd, "Initializing thermal sensor\n");
+
+       /* Thermal Sensor Initialization */
+       /*    Step 1: Reset the Thermal SBus Receiver */
+       ret = sbus_request_slow(dd, SBUS_THERMAL, 0x0,
+                               RESET_SBUS_RECEIVER, 0);
+       if (ret) {
+               THERM_FAILURE(dd, ret, "Bus Reset");
+               goto done;
+       }
+       /*    Step 2: Set Reset bit in Thermal block */
+       ret = sbus_request_slow(dd, SBUS_THERMAL, 0x0,
+                               WRITE_SBUS_RECEIVER, 0x1);
+       if (ret) {
+               THERM_FAILURE(dd, ret, "Therm Block Reset");
+               goto done;
+       }
+       /*    Step 3: Write clock divider value (100MHz -> 2MHz) */
+       ret = sbus_request_slow(dd, SBUS_THERMAL, 0x1,
+                               WRITE_SBUS_RECEIVER, 0x32);
+       if (ret) {
+               THERM_FAILURE(dd, ret, "Write Clock Div");
+               goto done;
+       }
+       /*    Step 4: Select temperature mode */
+       ret = sbus_request_slow(dd, SBUS_THERMAL, 0x3,
+                               WRITE_SBUS_RECEIVER,
+                               SBUS_THERM_MONITOR_MODE);
+       if (ret) {
+               THERM_FAILURE(dd, ret, "Write Mode Sel");
+               goto done;
+       }
+       /*    Step 5: De-assert block reset and start conversion */
+       ret = sbus_request_slow(dd, SBUS_THERMAL, 0x0,
+                               WRITE_SBUS_RECEIVER, 0x2);
+       if (ret) {
+               THERM_FAILURE(dd, ret, "Write Reset Deassert");
+               goto done;
+       }
+       /*    Step 5.1: Wait for first conversion (21.5ms per spec) */
+       msleep(22);
+
+       /* Enable polling of thermal readings */
+       write_csr(dd, ASIC_CFG_THERM_POLL_EN, 0x1);
+done:
+       release_hw_mutex(dd);
+       return ret;
+}
+
+static void handle_temp_err(struct hfi1_devdata *dd)
+{
+       struct hfi1_pportdata *ppd = &dd->pport[0];
+       /*
+        * Thermal Critical Interrupt
+        * Put the device into forced freeze mode, take link down to
+        * offline, and put DC into reset.
+        */
+       dd_dev_emerg(dd,
+                    "Critical temperature reached! Forcing device into freeze mode!\n");
+       dd->flags |= HFI1_FORCED_FREEZE;
+       start_freeze_handling(ppd, FREEZE_SELF|FREEZE_ABORT);
+       /*
+        * Shut DC down as much and as quickly as possible.
+        *
+        * Step 1: Take the link down to OFFLINE. This will cause the
+        *         8051 to put the Serdes in reset. However, we don't want to
+        *         go through the entire link state machine since we want to
+        *         shutdown ASAP. Furthermore, this is not a graceful shutdown
+        *         but rather an attempt to save the chip.
+        *         Code below is almost the same as quiet_serdes() but avoids
+        *         all the extra work and the sleeps.
+        */
+       ppd->driver_link_ready = 0;
+       ppd->link_enabled = 0;
+       set_physical_link_state(dd, PLS_OFFLINE |
+                               (OPA_LINKDOWN_REASON_SMA_DISABLED << 8));
+       /*
+        * Step 2: Shutdown LCB and 8051
+        *         After shutdown, do not restore DC_CFG_RESET value.
+        */
+       dc_shutdown(dd);
+}