Merge "PROX generator: performance optimization (3/4)"

[samplevnf.git] / VNFs / DPPD-PROX / handle_gen.c
diff --git a/VNFs/DPPD-PROX/handle_gen.c b/VNFs/DPPD-PROX/handle_gen.c

index 0e5164b..fcdbcd6 100644 (file)
--- a/VNFs/DPPD-PROX/handle_gen.c
+++ b/VNFs/DPPD-PROX/handle_gen.c
@@ -57,7 +57,6 @@ struct pkt_template {
  
  #define MAX_TEMPLATE_INDEX     65536
  #define TEMPLATE_INDEX_MASK    (MAX_TEMPLATE_INDEX - 1)
-#define MBUF_ARP               MAX_TEMPLATE_INDEX
  
  #define IP4(x) x & 0xff, (x >> 8) & 0xff, (x >> 16) & 0xff, x >> 24
  
@@ -86,7 +85,6 @@ struct task_gen_pcap {
  struct task_gen {
         struct task_base base;
         uint64_t hz;
-       uint64_t link_speed;
         struct token_time token_time;
         struct local_mbuf local_mbuf;
         struct pkt_template *pkt_template; /* packet templates used at runtime */
@@ -124,6 +122,7 @@ struct task_gen {
         uint8_t flags;
         uint8_t cksum_offload;
         struct prox_port_cfg *port;
+       uint64_t *bytes_to_tsc;
  } __rte_cache_aligned;
  
  static inline uint8_t ipv4_get_hdr_len(struct ipv4_hdr *ip)
@@ -262,15 +261,9 @@ static int handle_gen_pcap_bulk(struct task_base *tbase, struct rte_mbuf **mbuf,
         return task->base.tx_pkt(&task->base, new_pkts, send_bulk, NULL);
  }
  
-static uint64_t bytes_to_tsc(struct task_gen *task, uint32_t bytes)
+static inline uint64_t bytes_to_tsc(struct task_gen *task, uint32_t bytes)
  {
-       const uint64_t hz = task->hz;
-       const uint64_t bytes_per_hz = task->link_speed;
-
-       if (bytes_per_hz == UINT64_MAX)
-               return 0;
-
-       return hz * bytes / bytes_per_hz;
+       return task->bytes_to_tsc[bytes];
  }
  
  static uint32_t task_gen_next_pkt_idx(const struct task_gen *task, uint32_t pkt_idx)
@@ -360,9 +353,10 @@ static void task_gen_apply_accur_pos(struct task_gen *task, uint8_t *pkt_hdr, ui
         *(uint32_t *)(pkt_hdr + task->accur_pos) = accuracy;
  }
  
-static void task_gen_apply_sig(struct task_gen *task, uint8_t *pkt_hdr)
+static void task_gen_apply_sig(struct task_gen *task, struct pkt_template *dst)
  {
-       *(uint32_t *)(pkt_hdr + task->sig_pos) = task->sig;
+       if (task->sig_pos)
+               *(uint32_t *)(dst->buf + task->sig_pos) = task->sig;
  }
  
  static void task_gen_apply_all_accur_pos(struct task_gen *task, struct rte_mbuf **mbufs, uint8_t **pkt_hdr, uint32_t count)
@@ -374,22 +368,8 @@ static void task_gen_apply_all_accur_pos(struct task_gen *task, struct rte_mbuf
            packet task->pkt_queue_index. The ID modulo 64 is the
            same. */
         for (uint16_t j = 0; j < count; ++j) {
-               if ((mbufs[j]->udata64 & MBUF_ARP) == 0) {
-                       uint32_t accuracy = task->accur[(task->pkt_queue_index + j) & 63];
-                       task_gen_apply_accur_pos(task, pkt_hdr[j], accuracy);
-               }
-       }
-}
-
-static void task_gen_apply_all_sig(struct task_gen *task, struct rte_mbuf **mbufs, uint8_t **pkt_hdr, uint32_t count)
-{
-       if (!task->sig_pos)
-               return;
-
-       for (uint16_t j = 0; j < count; ++j) {
-               if ((mbufs[j]->udata64 & MBUF_ARP) == 0) {
-                       task_gen_apply_sig(task, pkt_hdr[j]);
-               }
+               uint32_t accuracy = task->accur[(task->pkt_queue_index + j) & 63];
+               task_gen_apply_accur_pos(task, pkt_hdr[j], accuracy);
         }
  }
  
@@ -406,11 +386,9 @@ static void task_gen_apply_all_unique_id(struct task_gen *task, struct rte_mbuf
                 return;
  
         for (uint16_t i = 0; i < count; ++i) {
-               if ((mbufs[i]->udata64 & MBUF_ARP) == 0) {
-                       struct unique_id id;
-                       unique_id_init(&id, task->generator_id, task->pkt_queue_index++);
-                       task_gen_apply_unique_id(task, pkt_hdr[i], &id);
-               }
+               struct unique_id id;
+               unique_id_init(&id, task->generator_id, task->pkt_queue_index++);
+               task_gen_apply_unique_id(task, pkt_hdr[i], &id);
         }
  }
  
@@ -424,11 +402,9 @@ static void task_gen_checksum_packets(struct task_gen *task, struct rte_mbuf **m
  
         uint32_t pkt_idx = task_gen_offset_pkt_idx(task, - count);
         for (uint16_t i = 0; i < count; ++i) {
-               if ((mbufs[i]->udata64 & MBUF_ARP) == 0) {
-                       struct pkt_template *pkt_template = &task->pkt_template[pkt_idx];
-                       checksum_packet(pkt_hdr[i], mbufs[i], pkt_template, task->cksum_offload);
-                       pkt_idx = task_gen_next_pkt_idx(task, pkt_idx);
-               }
+               struct pkt_template *pkt_template = &task->pkt_template[pkt_idx];
+               checksum_packet(pkt_hdr[i], mbufs[i], pkt_template, task->cksum_offload);
+               pkt_idx = task_gen_next_pkt_idx(task, pkt_idx);
         }
  }
  
@@ -448,8 +424,12 @@ static uint64_t task_gen_calc_bulk_duration(struct task_gen *task, uint32_t coun
         uint32_t pkt_idx = task_gen_offset_pkt_idx(task, - 1);
         struct pkt_template *last_pkt_template = &task->pkt_template[pkt_idx];
         uint32_t last_pkt_len = pkt_len_to_wire_size(last_pkt_template->len);
+#ifdef NO_EXTRAPOLATION
+       uint64_t bulk_duration = task->pkt_tsc_offset[count - 1];
+#else
         uint64_t last_pkt_duration = bytes_to_tsc(task, last_pkt_len);
         uint64_t bulk_duration = task->pkt_tsc_offset[count - 1] + last_pkt_duration;
+#endif
  
         return bulk_duration;
  }
@@ -484,6 +464,14 @@ static uint64_t task_gen_write_latency(struct task_gen *task, uint8_t **pkt_hdr,
            simply sleeping until delta_t is zero would leave a period
            of silence on the line. The error has been introduced
            earlier, but the packets have already been sent. */
+
+       /* This happens typically if previous bulk was delayed
+          by an interrupt e.g.  (with Time in nsec)
+          Time x: sleep 4 microsec
+          Time x+4000: send 64 packets (64 packets as 4000 nsec, w/ 10Gbps 64 bytes)
+          Time x+5000: send 16 packets (16 packets as 1000 nsec)
+          When we send the 16 packets, the 64 earlier packets are not yet
+          fully sent */
         if (tx_tsc < task->earliest_tsc_next_pkt)
                 delta_t = task->earliest_tsc_next_pkt - tx_tsc;
         else
@@ -492,12 +480,10 @@ static uint64_t task_gen_write_latency(struct task_gen *task, uint8_t **pkt_hdr,
         for (uint16_t i = 0; i < count; ++i) {
                 uint32_t *pos = (uint32_t *)(pkt_hdr[i] + task->lat_pos);
                 const uint64_t pkt_tsc = tx_tsc + delta_t + task->pkt_tsc_offset[i];
-
                 *pos = pkt_tsc >> LATENCY_ACCURACY;
         }
  
         uint64_t bulk_duration = task_gen_calc_bulk_duration(task, count);
-
         task->earliest_tsc_next_pkt = tx_tsc + delta_t + bulk_duration;
         write_tsc_after = rte_rdtsc();
         task->write_duration_estimate = write_tsc_after - write_tsc_before;
@@ -507,6 +493,7 @@ static uint64_t task_gen_write_latency(struct task_gen *task, uint8_t **pkt_hdr,
         do {
                 tsc_before_tx = rte_rdtsc();
         } while (tsc_before_tx < tx_tsc);
+
         return tsc_before_tx;
  }
  
@@ -546,7 +533,11 @@ static void task_gen_build_packets(struct task_gen *task, struct rte_mbuf **mbuf
                 mbufs[i]->udata64 = task->pkt_idx & TEMPLATE_INDEX_MASK;
                 struct ether_hdr *hdr = (struct ether_hdr *)pkt_hdr[i];
                 if (task->lat_enabled) {
+#ifdef NO_EXTRAPOLATION
+                       task->pkt_tsc_offset[i] = 0;
+#else
                         task->pkt_tsc_offset[i] = bytes_to_tsc(task, will_send_bytes);
+#endif
                         will_send_bytes += pkt_len_to_wire_size(pkt_template->len);
                 }
                 task->pkt_idx = task_gen_next_pkt_idx(task, task->pkt_idx);
@@ -642,16 +633,6 @@ static int handle_gen_bulk(struct task_base *tbase, struct rte_mbuf **mbufs, uin
  
         int i, j;
  
-       // If link is down, link_speed is 0
-       if (unlikely(task->link_speed == 0)) {
-               if (task->port && task->port->link_speed != 0) {
-                       task->link_speed = task->port->link_speed * 125000L;
-                       plog_info("\tPort %u: link speed is %ld Mbps\n",
-                               (uint8_t)(task->port - prox_port_cfg), 8 * task->link_speed / 1000000);
-               } else
-                       return 0;
-       }
-
         task_gen_update_config(task);
  
         if (task->pkt_count == 0) {
@@ -680,7 +661,6 @@ static int handle_gen_bulk(struct task_base *tbase, struct rte_mbuf **mbufs, uin
         task_gen_build_packets(task, new_pkts, pkt_hdr, send_bulk);
         task_gen_apply_all_random_fields(task, pkt_hdr, send_bulk);
         task_gen_apply_all_accur_pos(task, new_pkts, pkt_hdr, send_bulk);
-       task_gen_apply_all_sig(task, new_pkts, pkt_hdr, send_bulk);
         task_gen_apply_all_unique_id(task, new_pkts, pkt_hdr, send_bulk);
  
         uint64_t tsc_before_tx;
@@ -918,6 +898,7 @@ static void task_gen_reset_pkt_templates_content(struct task_gen *task)
                 src = &task->pkt_template_orig[i];
                 dst = &task->pkt_template[i];
                 memcpy(dst->buf, src->buf, dst->len);
+               task_gen_apply_sig(task, dst);
         }
  }
  
@@ -1005,15 +986,20 @@ static struct rte_mempool *task_gen_create_mempool(struct task_args *targ, uint1
         const int sock_id = rte_lcore_to_socket_id(targ->lconf->id);
  
         name[0]++;
-       uint32_t mbuf_size = MBUF_SIZE;
+       uint32_t mbuf_size = TX_MBUF_SIZE;
         if (max_frame_size + (unsigned)sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM > mbuf_size)
                 mbuf_size = max_frame_size + (unsigned)sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM;
+       plog_info("\t\tCreating mempool with name '%s'\n", name);
         ret = rte_mempool_create(name, targ->nb_mbuf - 1, mbuf_size,
                                  targ->nb_cache_mbuf, sizeof(struct rte_pktmbuf_pool_private),
                                  rte_pktmbuf_pool_init, NULL, rte_pktmbuf_init, 0,
                                  sock_id, 0);
         PROX_PANIC(ret == NULL, "Failed to allocate dummy memory pool on socket %u with %u elements\n",
                    sock_id, targ->nb_mbuf - 1);
+
+        plog_info("\t\tMempool %p size = %u * %u cache %u, socket %d\n", ret,
+                  targ->nb_mbuf - 1, mbuf_size, targ->nb_cache_mbuf, sock_id);
+
         return ret;
  }
  
@@ -1189,18 +1175,7 @@ static void start(struct task_base *tbase)
         if (tbase->l3.tmaster) {
                 register_all_ip_to_ctrl_plane(task);
         }
-       if (task->port) {
-               // task->port->link_speed reports the link speed in Mbps e.g. 40k for a 40 Gbps NIC.
-               // task->link_speed reports link speed in Bytes per sec.
-               // It can be 0 if link is down, and must hence be updated in fast path.
-               task->link_speed = task->port->link_speed * 125000L;
-               if (task->link_speed)
-                       plog_info("\tPort %u: link speed is %ld Mbps\n",
-                               (uint8_t)(task->port - prox_port_cfg), 8 * task->link_speed / 1000000);
-               else
-                       plog_info("\tPort %u: link speed is %ld Mbps - link might be down\n",
-                               (uint8_t)(task->port - prox_port_cfg), 8 * task->link_speed / 1000000);
-       }
+
         /* TODO
            Handle the case when two tasks transmit to the same port
            and one of them is stopped. In that case ARP (requests or replies)
@@ -1241,7 +1216,7 @@ static void init_task_gen(struct task_base *tbase, struct task_args *targ)
         struct prox_port_cfg *port = find_reachable_port(targ);
         // TODO: check that all reachable ports have the same mtu...
         if (port) {
-               task->cksum_offload = port->capabilities.tx_offload_cksum;
+               task->cksum_offload = port->requested_tx_offload & (DEV_TX_OFFLOAD_IPV4_CKSUM | DEV_TX_OFFLOAD_UDP_CKSUM);
                 task->port = port;
                 task->max_frame_size = port->mtu + ETHER_HDR_LEN + 2 * PROX_VLAN_TAG_SIZE;
         } else {
@@ -1286,7 +1261,26 @@ static void init_task_gen(struct task_base *tbase, struct task_args *targ)
  
         task->generator_id = targ->generator_id;
         plog_info("\tGenerator id = %d\n", task->generator_id);
-       task->link_speed = UINT64_MAX;
+
+       // Allocate array holding bytes to tsc for supported frame sizes
+       task->bytes_to_tsc = prox_zmalloc(task->max_frame_size * MAX_PKT_BURST * sizeof(task->bytes_to_tsc[0]), rte_lcore_to_socket_id(targ->lconf->id));
+       PROX_PANIC(task->bytes_to_tsc == NULL,
+               "Failed to allocate %u bytes (in huge pages) for bytes_to_tsc\n", task->max_frame_size);
+
+       // task->port->max_link_speed reports the maximum, non-negotiated link speed in Mbps e.g. 40k for a 40 Gbps NIC.
+       // It can be UINT32_MAX (virtual devices or not supported by DPDK < 16.04)
+       uint64_t bytes_per_hz = UINT64_MAX;
+       if ((task->port) && (task->port->max_link_speed != UINT32_MAX)) {
+               bytes_per_hz = task->port->max_link_speed * 125000L;
+               plog_info("\tPort %u: max link speed is %ld Mbps\n",
+                       (uint8_t)(task->port - prox_port_cfg), 8 * bytes_per_hz / 1000000);
+       }
+       for (unsigned int i = 0; i < task->max_frame_size * MAX_PKT_BURST ; i++) {
+               if (bytes_per_hz == UINT64_MAX)
+                       task->bytes_to_tsc[i] = 0;
+               else
+                       task->bytes_to_tsc[i] = (task->hz * i) / bytes_per_hz;
+       }
  
         if (!strcmp(targ->pcap_file, "")) {
                 plog_info("\tUsing inline definition of a packet\n");
@@ -1296,7 +1290,8 @@ static void init_task_gen(struct task_base *tbase, struct task_args *targ)
                 task_init_gen_load_pcap(task, targ);
         }
  
-       if ((targ->flags & DSF_KEEP_SRC_MAC) == 0 && (targ->nb_txrings || targ->nb_txports)) {
+       PROX_PANIC(((targ->nb_txrings == 0) && (targ->nb_txports == 0)), "Gen mode requires a tx ring or a tx port");
+       if ((targ->flags & DSF_KEEP_SRC_MAC) == 0) {
                 uint8_t *src_addr = prox_port_cfg[tbase->tx_params_hw.tx_port_queue->port].eth_addr.addr_bytes;
                 for (uint32_t i = 0; i < task->n_pkts; ++i) {
                         rte_memcpy(&task->pkt_template[i].buf[6], src_addr, 6);
@@ -1318,7 +1313,7 @@ static struct task_init task_init_gen = {
  #ifdef SOFT_CRC
         // For SOFT_CRC, no offload is needed. If both NOOFFLOADS and NOMULTSEGS flags are set the
         // vector mode is used by DPDK, resulting (theoretically) in higher performance.
-       .flag_features = TASK_FEATURE_NEVER_DISCARDS | TASK_FEATURE_NO_RX | TASK_FEATURE_TXQ_FLAGS_NOOFFLOADS | TASK_FEATURE_TXQ_FLAGS_NOMULTSEGS,
+       .flag_features = TASK_FEATURE_NEVER_DISCARDS | TASK_FEATURE_NO_RX | TASK_FEATURE_TXQ_FLAGS_NOOFFLOADS,
  #else
         .flag_features = TASK_FEATURE_NEVER_DISCARDS | TASK_FEATURE_NO_RX,
  #endif
@@ -1335,7 +1330,7 @@ static struct task_init task_init_gen_l3 = {
  #ifdef SOFT_CRC
         // For SOFT_CRC, no offload is needed. If both NOOFFLOADS and NOMULTSEGS flags are set the
         // vector mode is used by DPDK, resulting (theoretically) in higher performance.
-       .flag_features = TASK_FEATURE_NEVER_DISCARDS | TASK_FEATURE_NO_RX | TASK_FEATURE_TXQ_FLAGS_NOOFFLOADS | TASK_FEATURE_TXQ_FLAGS_NOMULTSEGS,
+       .flag_features = TASK_FEATURE_NEVER_DISCARDS | TASK_FEATURE_NO_RX | TASK_FEATURE_TXQ_FLAGS_NOOFFLOADS,
  #else
         .flag_features = TASK_FEATURE_NEVER_DISCARDS | TASK_FEATURE_NO_RX,
  #endif
@@ -1350,7 +1345,7 @@ static struct task_init task_init_gen_pcap = {
         .start = start_pcap,
         .early_init = init_task_gen_early,
  #ifdef SOFT_CRC
-       .flag_features = TASK_FEATURE_NEVER_DISCARDS | TASK_FEATURE_NO_RX | TASK_FEATURE_TXQ_FLAGS_NOOFFLOADS | TASK_FEATURE_TXQ_FLAGS_NOMULTSEGS,
+       .flag_features = TASK_FEATURE_NEVER_DISCARDS | TASK_FEATURE_NO_RX | TASK_FEATURE_TXQ_FLAGS_NOOFFLOADS,
  #else
         .flag_features = TASK_FEATURE_NEVER_DISCARDS | TASK_FEATURE_NO_RX,
  #endif