These changes are the raw update to linux-4.4.6-rt14. Kernel sources
[kvmfornfv.git] / kernel / net / rds / ib_send.c
index bd3825d..eac30bf 100644 (file)
 #include "rds.h"
 #include "ib.h"
 
-static char *rds_ib_wc_status_strings[] = {
-#define RDS_IB_WC_STATUS_STR(foo) \
-               [IB_WC_##foo] = __stringify(IB_WC_##foo)
-       RDS_IB_WC_STATUS_STR(SUCCESS),
-       RDS_IB_WC_STATUS_STR(LOC_LEN_ERR),
-       RDS_IB_WC_STATUS_STR(LOC_QP_OP_ERR),
-       RDS_IB_WC_STATUS_STR(LOC_EEC_OP_ERR),
-       RDS_IB_WC_STATUS_STR(LOC_PROT_ERR),
-       RDS_IB_WC_STATUS_STR(WR_FLUSH_ERR),
-       RDS_IB_WC_STATUS_STR(MW_BIND_ERR),
-       RDS_IB_WC_STATUS_STR(BAD_RESP_ERR),
-       RDS_IB_WC_STATUS_STR(LOC_ACCESS_ERR),
-       RDS_IB_WC_STATUS_STR(REM_INV_REQ_ERR),
-       RDS_IB_WC_STATUS_STR(REM_ACCESS_ERR),
-       RDS_IB_WC_STATUS_STR(REM_OP_ERR),
-       RDS_IB_WC_STATUS_STR(RETRY_EXC_ERR),
-       RDS_IB_WC_STATUS_STR(RNR_RETRY_EXC_ERR),
-       RDS_IB_WC_STATUS_STR(LOC_RDD_VIOL_ERR),
-       RDS_IB_WC_STATUS_STR(REM_INV_RD_REQ_ERR),
-       RDS_IB_WC_STATUS_STR(REM_ABORT_ERR),
-       RDS_IB_WC_STATUS_STR(INV_EECN_ERR),
-       RDS_IB_WC_STATUS_STR(INV_EEC_STATE_ERR),
-       RDS_IB_WC_STATUS_STR(FATAL_ERR),
-       RDS_IB_WC_STATUS_STR(RESP_TIMEOUT_ERR),
-       RDS_IB_WC_STATUS_STR(GENERAL_ERR),
-#undef RDS_IB_WC_STATUS_STR
-};
-
-char *rds_ib_wc_status_str(enum ib_wc_status status)
-{
-       return rds_str_array(rds_ib_wc_status_strings,
-                            ARRAY_SIZE(rds_ib_wc_status_strings), status);
-}
-
 /*
  * Convert IB-specific error message to RDS error message and call core
  * completion handler.
@@ -229,16 +195,16 @@ void rds_ib_send_init_ring(struct rds_ib_connection *ic)
 
                send->s_op = NULL;
 
-               send->s_wr.wr_id = i;
+               send->s_wr.wr_id = i | RDS_IB_SEND_OP;
                send->s_wr.sg_list = send->s_sge;
                send->s_wr.ex.imm_data = 0;
 
                sge = &send->s_sge[0];
                sge->addr = ic->i_send_hdrs_dma + (i * sizeof(struct rds_header));
                sge->length = sizeof(struct rds_header);
-               sge->lkey = ic->i_mr->lkey;
+               sge->lkey = ic->i_pd->local_dma_lkey;
 
-               send->s_sge[1].lkey = ic->i_mr->lkey;
+               send->s_sge[1].lkey = ic->i_pd->local_dma_lkey;
        }
 }
 
@@ -271,81 +237,73 @@ static void rds_ib_sub_signaled(struct rds_ib_connection *ic, int nr)
  * unallocs the next free entry in the ring it doesn't alter which is
  * the next to be freed, which is what this is concerned with.
  */
-void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context)
+void rds_ib_send_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc)
 {
-       struct rds_connection *conn = context;
-       struct rds_ib_connection *ic = conn->c_transport_data;
        struct rds_message *rm = NULL;
-       struct ib_wc wc;
+       struct rds_connection *conn = ic->conn;
        struct rds_ib_send_work *send;
        u32 completed;
        u32 oldest;
        u32 i = 0;
-       int ret;
        int nr_sig = 0;
 
-       rdsdebug("cq %p conn %p\n", cq, conn);
-       rds_ib_stats_inc(s_ib_tx_cq_call);
-       ret = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
-       if (ret)
-               rdsdebug("ib_req_notify_cq send failed: %d\n", ret);
-
-       while (ib_poll_cq(cq, 1, &wc) > 0) {
-               rdsdebug("wc wr_id 0x%llx status %u (%s) byte_len %u imm_data %u\n",
-                        (unsigned long long)wc.wr_id, wc.status,
-                        rds_ib_wc_status_str(wc.status), wc.byte_len,
-                        be32_to_cpu(wc.ex.imm_data));
-               rds_ib_stats_inc(s_ib_tx_cq_event);
-
-               if (wc.wr_id == RDS_IB_ACK_WR_ID) {
-                       if (time_after(jiffies, ic->i_ack_queued + HZ/2))
-                               rds_ib_stats_inc(s_ib_tx_stalled);
-                       rds_ib_ack_send_complete(ic);
-                       continue;
-               }
 
-               oldest = rds_ib_ring_oldest(&ic->i_send_ring);
+       rdsdebug("wc wr_id 0x%llx status %u (%s) byte_len %u imm_data %u\n",
+                (unsigned long long)wc->wr_id, wc->status,
+                ib_wc_status_msg(wc->status), wc->byte_len,
+                be32_to_cpu(wc->ex.imm_data));
+       rds_ib_stats_inc(s_ib_tx_cq_event);
 
-               completed = rds_ib_ring_completed(&ic->i_send_ring, wc.wr_id, oldest);
+       if (wc->wr_id == RDS_IB_ACK_WR_ID) {
+               if (time_after(jiffies, ic->i_ack_queued + HZ / 2))
+                       rds_ib_stats_inc(s_ib_tx_stalled);
+               rds_ib_ack_send_complete(ic);
+               return;
+       }
 
-               for (i = 0; i < completed; i++) {
-                       send = &ic->i_sends[oldest];
-                       if (send->s_wr.send_flags & IB_SEND_SIGNALED)
-                               nr_sig++;
+       oldest = rds_ib_ring_oldest(&ic->i_send_ring);
 
-                       rm = rds_ib_send_unmap_op(ic, send, wc.status);
+       completed = rds_ib_ring_completed(&ic->i_send_ring,
+                                         (wc->wr_id & ~RDS_IB_SEND_OP),
+                                         oldest);
 
-                       if (time_after(jiffies, send->s_queued + HZ/2))
-                               rds_ib_stats_inc(s_ib_tx_stalled);
+       for (i = 0; i < completed; i++) {
+               send = &ic->i_sends[oldest];
+               if (send->s_wr.send_flags & IB_SEND_SIGNALED)
+                       nr_sig++;
 
-                       if (send->s_op) {
-                               if (send->s_op == rm->m_final_op) {
-                                       /* If anyone waited for this message to get flushed out, wake
-                                        * them up now */
-                                       rds_message_unmapped(rm);
-                               }
-                               rds_message_put(rm);
-                               send->s_op = NULL;
-                       }
+               rm = rds_ib_send_unmap_op(ic, send, wc->status);
 
-                       oldest = (oldest + 1) % ic->i_send_ring.w_nr;
-               }
+               if (time_after(jiffies, send->s_queued + HZ / 2))
+                       rds_ib_stats_inc(s_ib_tx_stalled);
 
-               rds_ib_ring_free(&ic->i_send_ring, completed);
-               rds_ib_sub_signaled(ic, nr_sig);
-               nr_sig = 0;
-
-               if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags) ||
-                   test_bit(0, &conn->c_map_queued))
-                       queue_delayed_work(rds_wq, &conn->c_send_w, 0);
-
-               /* We expect errors as the qp is drained during shutdown */
-               if (wc.status != IB_WC_SUCCESS && rds_conn_up(conn)) {
-                       rds_ib_conn_error(conn, "send completion on %pI4 had status "
-                                         "%u (%s), disconnecting and reconnecting\n",
-                                         &conn->c_faddr, wc.status,
-                                         rds_ib_wc_status_str(wc.status));
+               if (send->s_op) {
+                       if (send->s_op == rm->m_final_op) {
+                               /* If anyone waited for this message to get
+                                * flushed out, wake them up now
+                                */
+                               rds_message_unmapped(rm);
+                       }
+                       rds_message_put(rm);
+                       send->s_op = NULL;
                }
+
+               oldest = (oldest + 1) % ic->i_send_ring.w_nr;
+       }
+
+       rds_ib_ring_free(&ic->i_send_ring, completed);
+       rds_ib_sub_signaled(ic, nr_sig);
+       nr_sig = 0;
+
+       if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags) ||
+           test_bit(0, &conn->c_map_queued))
+               queue_delayed_work(rds_wq, &conn->c_send_w, 0);
+
+       /* We expect errors as the qp is drained during shutdown */
+       if (wc->status != IB_WC_SUCCESS && rds_conn_up(conn)) {
+               rds_ib_conn_error(conn, "send completion on %pI4 had status %u (%s), disconnecting and reconnecting\n",
+                                 &conn->c_faddr, wc->status,
+                                 ib_wc_status_msg(wc->status));
        }
 }
 
@@ -605,6 +563,8 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
                }
 
                rds_message_addref(rm);
+               rm->data.op_dmasg = 0;
+               rm->data.op_dmaoff = 0;
                ic->i_data_op = &rm->data;
 
                /* Finalize the header */
@@ -658,7 +618,7 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
        send = &ic->i_sends[pos];
        first = send;
        prev = NULL;
-       scat = &ic->i_data_op->op_sg[sg];
+       scat = &ic->i_data_op->op_sg[rm->data.op_dmasg];
        i = 0;
        do {
                unsigned int len = 0;
@@ -680,17 +640,20 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
                /* Set up the data, if present */
                if (i < work_alloc
                    && scat != &rm->data.op_sg[rm->data.op_count]) {
-                       len = min(RDS_FRAG_SIZE, ib_sg_dma_len(dev, scat) - off);
+                       len = min(RDS_FRAG_SIZE,
+                               ib_sg_dma_len(dev, scat) - rm->data.op_dmaoff);
                        send->s_wr.num_sge = 2;
 
-                       send->s_sge[1].addr = ib_sg_dma_address(dev, scat) + off;
+                       send->s_sge[1].addr = ib_sg_dma_address(dev, scat);
+                       send->s_sge[1].addr += rm->data.op_dmaoff;
                        send->s_sge[1].length = len;
 
                        bytes_sent += len;
-                       off += len;
-                       if (off == ib_sg_dma_len(dev, scat)) {
+                       rm->data.op_dmaoff += len;
+                       if (rm->data.op_dmaoff == ib_sg_dma_len(dev, scat)) {
                                scat++;
-                               off = 0;
+                               rm->data.op_dmasg++;
+                               rm->data.op_dmaoff = 0;
                        }
                }
 
@@ -738,6 +701,11 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
        if (scat == &rm->data.op_sg[rm->data.op_count]) {
                prev->s_op = ic->i_data_op;
                prev->s_wr.send_flags |= IB_SEND_SOLICITED;
+               if (!(prev->s_wr.send_flags & IB_SEND_SIGNALED)) {
+                       ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs;
+                       prev->s_wr.send_flags |= IB_SEND_SIGNALED;
+                       nr_sig++;
+               }
                ic->i_data_op = NULL;
        }
 
@@ -809,23 +777,23 @@ int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op)
        send->s_queued = jiffies;
 
        if (op->op_type == RDS_ATOMIC_TYPE_CSWP) {
-               send->s_wr.opcode = IB_WR_MASKED_ATOMIC_CMP_AND_SWP;
-               send->s_wr.wr.atomic.compare_add = op->op_m_cswp.compare;
-               send->s_wr.wr.atomic.swap = op->op_m_cswp.swap;
-               send->s_wr.wr.atomic.compare_add_mask = op->op_m_cswp.compare_mask;
-               send->s_wr.wr.atomic.swap_mask = op->op_m_cswp.swap_mask;
+               send->s_atomic_wr.wr.opcode = IB_WR_MASKED_ATOMIC_CMP_AND_SWP;
+               send->s_atomic_wr.compare_add = op->op_m_cswp.compare;
+               send->s_atomic_wr.swap = op->op_m_cswp.swap;
+               send->s_atomic_wr.compare_add_mask = op->op_m_cswp.compare_mask;
+               send->s_atomic_wr.swap_mask = op->op_m_cswp.swap_mask;
        } else { /* FADD */
-               send->s_wr.opcode = IB_WR_MASKED_ATOMIC_FETCH_AND_ADD;
-               send->s_wr.wr.atomic.compare_add = op->op_m_fadd.add;
-               send->s_wr.wr.atomic.swap = 0;
-               send->s_wr.wr.atomic.compare_add_mask = op->op_m_fadd.nocarry_mask;
-               send->s_wr.wr.atomic.swap_mask = 0;
+               send->s_atomic_wr.wr.opcode = IB_WR_MASKED_ATOMIC_FETCH_AND_ADD;
+               send->s_atomic_wr.compare_add = op->op_m_fadd.add;
+               send->s_atomic_wr.swap = 0;
+               send->s_atomic_wr.compare_add_mask = op->op_m_fadd.nocarry_mask;
+               send->s_atomic_wr.swap_mask = 0;
        }
        nr_sig = rds_ib_set_wr_signal_state(ic, send, op->op_notify);
-       send->s_wr.num_sge = 1;
-       send->s_wr.next = NULL;
-       send->s_wr.wr.atomic.remote_addr = op->op_remote_addr;
-       send->s_wr.wr.atomic.rkey = op->op_rkey;
+       send->s_atomic_wr.wr.num_sge = 1;
+       send->s_atomic_wr.wr.next = NULL;
+       send->s_atomic_wr.remote_addr = op->op_remote_addr;
+       send->s_atomic_wr.rkey = op->op_rkey;
        send->s_op = op;
        rds_message_addref(container_of(send->s_op, struct rds_message, atomic));
 
@@ -842,7 +810,7 @@ int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op)
        /* Convert our struct scatterlist to struct ib_sge */
        send->s_sge[0].addr = ib_sg_dma_address(ic->i_cm_id->device, op->op_sg);
        send->s_sge[0].length = ib_sg_dma_len(ic->i_cm_id->device, op->op_sg);
-       send->s_sge[0].lkey = ic->i_mr->lkey;
+       send->s_sge[0].lkey = ic->i_pd->local_dma_lkey;
 
        rdsdebug("rva %Lx rpa %Lx len %u\n", op->op_remote_addr,
                 send->s_sge[0].addr, send->s_sge[0].length);
@@ -850,11 +818,11 @@ int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op)
        if (nr_sig)
                atomic_add(nr_sig, &ic->i_signaled_sends);
 
-       failed_wr = &send->s_wr;
-       ret = ib_post_send(ic->i_cm_id->qp, &send->s_wr, &failed_wr);
+       failed_wr = &send->s_atomic_wr.wr;
+       ret = ib_post_send(ic->i_cm_id->qp, &send->s_atomic_wr.wr, &failed_wr);
        rdsdebug("ic %p send %p (wr %p) ret %d wr %p\n", ic,
-                send, &send->s_wr, ret, failed_wr);
-       BUG_ON(failed_wr != &send->s_wr);
+                send, &send->s_atomic_wr, ret, failed_wr);
+       BUG_ON(failed_wr != &send->s_atomic_wr.wr);
        if (ret) {
                printk(KERN_WARNING "RDS/IB: atomic ib_post_send to %pI4 "
                       "returned %d\n", &conn->c_faddr, ret);
@@ -863,9 +831,9 @@ int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op)
                goto out;
        }
 
-       if (unlikely(failed_wr != &send->s_wr)) {
+       if (unlikely(failed_wr != &send->s_atomic_wr.wr)) {
                printk(KERN_WARNING "RDS/IB: atomic ib_post_send() rc=%d, but failed_wqe updated!\n", ret);
-               BUG_ON(failed_wr != &send->s_wr);
+               BUG_ON(failed_wr != &send->s_atomic_wr.wr);
        }
 
 out:
@@ -936,27 +904,28 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
                nr_sig += rds_ib_set_wr_signal_state(ic, send, op->op_notify);
 
                send->s_wr.opcode = op->op_write ? IB_WR_RDMA_WRITE : IB_WR_RDMA_READ;
-               send->s_wr.wr.rdma.remote_addr = remote_addr;
-               send->s_wr.wr.rdma.rkey = op->op_rkey;
+               send->s_rdma_wr.remote_addr = remote_addr;
+               send->s_rdma_wr.rkey = op->op_rkey;
 
                if (num_sge > max_sge) {
-                       send->s_wr.num_sge = max_sge;
+                       send->s_rdma_wr.wr.num_sge = max_sge;
                        num_sge -= max_sge;
                } else {
-                       send->s_wr.num_sge = num_sge;
+                       send->s_rdma_wr.wr.num_sge = num_sge;
                }
 
-               send->s_wr.next = NULL;
+               send->s_rdma_wr.wr.next = NULL;
 
                if (prev)
-                       prev->s_wr.next = &send->s_wr;
+                       prev->s_rdma_wr.wr.next = &send->s_rdma_wr.wr;
 
-               for (j = 0; j < send->s_wr.num_sge && scat != &op->op_sg[op->op_count]; j++) {
+               for (j = 0; j < send->s_rdma_wr.wr.num_sge &&
+                    scat != &op->op_sg[op->op_count]; j++) {
                        len = ib_sg_dma_len(ic->i_cm_id->device, scat);
                        send->s_sge[j].addr =
                                 ib_sg_dma_address(ic->i_cm_id->device, scat);
                        send->s_sge[j].length = len;
-                       send->s_sge[j].lkey = ic->i_mr->lkey;
+                       send->s_sge[j].lkey = ic->i_pd->local_dma_lkey;
 
                        sent += len;
                        rdsdebug("ic %p sent %d remote_addr %llu\n", ic, sent, remote_addr);
@@ -966,7 +935,9 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
                }
 
                rdsdebug("send %p wr %p num_sge %u next %p\n", send,
-                       &send->s_wr, send->s_wr.num_sge, send->s_wr.next);
+                       &send->s_rdma_wr.wr,
+                       send->s_rdma_wr.wr.num_sge,
+                       send->s_rdma_wr.wr.next);
 
                prev = send;
                if (++send == &ic->i_sends[ic->i_send_ring.w_nr])
@@ -987,11 +958,11 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
        if (nr_sig)
                atomic_add(nr_sig, &ic->i_signaled_sends);
 
-       failed_wr = &first->s_wr;
-       ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr);
+       failed_wr = &first->s_rdma_wr.wr;
+       ret = ib_post_send(ic->i_cm_id->qp, &first->s_rdma_wr.wr, &failed_wr);
        rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic,
-                first, &first->s_wr, ret, failed_wr);
-       BUG_ON(failed_wr != &first->s_wr);
+                first, &first->s_rdma_wr.wr, ret, failed_wr);
+       BUG_ON(failed_wr != &first->s_rdma_wr.wr);
        if (ret) {
                printk(KERN_WARNING "RDS/IB: rdma ib_post_send to %pI4 "
                       "returned %d\n", &conn->c_faddr, ret);
@@ -1000,9 +971,9 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
                goto out;
        }
 
-       if (unlikely(failed_wr != &first->s_wr)) {
+       if (unlikely(failed_wr != &first->s_rdma_wr.wr)) {
                printk(KERN_WARNING "RDS/IB: ib_post_send() rc=%d, but failed_wqe updated!\n", ret);
-               BUG_ON(failed_wr != &first->s_wr);
+               BUG_ON(failed_wr != &first->s_rdma_wr.wr);
        }