// Copyright (c) 2010-2020 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <rte_cycles.h>
#include <rte_ethdev.h>
#include <rte_version.h>

#include "rx_pkt.h"
#include "task_base.h"
#include "clock.h"
#include "stats.h"
#include "log.h"
#include "mbuf_utils.h"
#include "prefetch.h"
#include "arp.h"
#include "tx_pkt.h"
#include "handle_master.h"
#include "input.h" /* Needed for callback on dump */
#define TCP_PORT_BGP	rte_cpu_to_be_16(179)
/* The _param versions of the rx_pkt_hw functions are used to create
   two instances of very similar variations of these functions. The
   variations are specified by the "multi" parameter which signifies
   that the rte_eth_rx_burst function should be called multiple times.
   The reason for this is that with the vector PMD, the maximum number
   of packets being returned is 32. If packets have been split in
   multiple mbufs then rte_eth_rx_burst might even receive fewer than
   32 packets.
   Some algorithms (like QoS) only work correctly if the dequeue step
   can find 32 packets, which may require receiving more than 32
   packets in one RX call. */

#define MIN_PMD_RX	32
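
/* Receive up to MAX_PKT_BURST packets from a single port/queue pair.
   With "multi" set, keep issuing bursts of MIN_PMD_RX packets while the
   previous burst was non-empty and there is room left in the mbuf
   array, so that vector PMDs (capped at 32 packets per burst) can still
   fill a larger batch. */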
static uint16_t rx_pkt_hw_port_queue(struct port_queue *pq, struct rte_mbuf **mbufs, int multi)
{
	uint16_t nb_rx, n;

	nb_rx = rte_eth_rx_burst(pq->port, pq->queue, mbufs, MAX_PKT_BURST);

	if (multi) {
		n = nb_rx;
		while (n != 0 && MAX_PKT_BURST - nb_rx >= MIN_PMD_RX) {
			n = rte_eth_rx_burst(pq->port, pq->queue, mbufs + nb_rx, MIN_PMD_RX);
			nb_rx += n;
			PROX_PANIC(nb_rx > 64, "Received %d packets while expecting maximum %d\n", nb_rx, 64);
		}
	}
	return nb_rx;
}
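
/* Advance the round-robin index of the next port to read from,
   wrapping around once every port has been polled. */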
static void next_port(struct rx_params_hw *rx_params_hw)
{
	++rx_params_hw->last_read_portid;
	if (unlikely(rx_params_hw->last_read_portid == rx_params_hw->nb_rxports)) {
		rx_params_hw->last_read_portid = 0;
	}
}
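
/* Same advance, implemented with a bitmask; only valid when the number
   of RX ports is a power of two. */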
static void next_port_pow2(struct rx_params_hw *rx_params_hw)
{
	rx_params_hw->last_read_portid = (rx_params_hw->last_read_portid + 1) & rx_params_hw->rxport_mask;
}
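
/* Dump or trace a control-plane packet (ARP/ICMP/BGP) while the task
   still has dump or trace requests pending. When a command-line input
   with a reply callback is attached, the packet is sent back through
   that callback instead of being logged. */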
static inline void dump_l3(struct task_base *tbase, struct rte_mbuf *mbuf)
{
	if (unlikely(tbase->aux->task_rt_dump.n_print_rx)) {
		if ((tbase->aux->task_rt_dump.input == NULL) || (tbase->aux->task_rt_dump.input->reply == NULL)) {
			plogdx_info(mbuf, "RX: ");
		} else {
			struct input *input = tbase->aux->task_rt_dump.input;
			char tmp[128];
			int strlen;
#if RTE_VERSION >= RTE_VERSION_NUM(1,8,0,0)
			int port_id = mbuf->port;
#else
			int port_id = mbuf->pkt.in_port;
#endif
			strlen = snprintf(tmp, sizeof(tmp), "pktdump,%d,%d\n", port_id,
					  rte_pktmbuf_pkt_len(mbuf));
			input->reply(input, tmp, strlen);
			input->reply(input, rte_pktmbuf_mtod(mbuf, char *), rte_pktmbuf_pkt_len(mbuf));
			input->reply(input, "\n", 1);
		}
		tbase->aux->task_rt_dump.n_print_rx--;
		if (0 == tbase->aux->task_rt_dump.n_print_rx) {
			task_base_del_rx_pkt_function(tbase, rx_pkt_dump);
		}
	}
	if (unlikely(tbase->aux->task_rt_dump.n_trace)) {
		plogdx_info(mbuf, "RX: ");
		tbase->aux->task_rt_dump.n_trace--;
	}
}
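
/* Common body of the hardware RX functions. Reads a burst from the
   current port/queue, then advances the port index through the "next"
   callback. With "l3" set, ARP, ICMP and BGP packets are diverted to
   the control plane ring and compacted out of the returned mbuf array;
   "skip" counts the diverted packets so the caller only sees data-plane
   traffic. */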
static uint16_t rx_pkt_hw_param(struct task_base *tbase, struct rte_mbuf ***mbufs_ptr, int multi,
				void (*next)(struct rx_params_hw *rx_param_hw), int l3)
{
	uint8_t last_read_portid;
	uint16_t nb_rx;
	int skip = 0;

	START_EMPTY_MEASSURE();
	*mbufs_ptr = tbase->ws_mbuf->mbuf[0] +
		(RTE_ALIGN_CEIL(tbase->ws_mbuf->idx[0].prod, 2) & WS_MBUF_MASK);

	last_read_portid = tbase->rx_params_hw.last_read_portid;
	struct port_queue *pq = &tbase->rx_params_hw.rx_pq[last_read_portid];

	nb_rx = rx_pkt_hw_port_queue(pq, *mbufs_ptr, multi);
	next(&tbase->rx_params_hw);

	if (l3) {
		struct rte_mbuf **mbufs = *mbufs_ptr;
		int i;
		struct ether_hdr_arp *hdr_arp[MAX_PKT_BURST];
		prox_rte_ether_hdr *hdr;
		for (i = 0; i < nb_rx; i++) {
			PREFETCH0(mbufs[i]);
		}
		for (i = 0; i < nb_rx; i++) {
			hdr_arp[i] = rte_pktmbuf_mtod(mbufs[i], struct ether_hdr_arp *);
			PREFETCH0(hdr_arp[i]);
		}
		for (i = 0; i < nb_rx; i++) {
			if (likely(hdr_arp[i]->ether_hdr.ether_type == ETYPE_IPv4)) {
				hdr = (prox_rte_ether_hdr *)hdr_arp[i];
				prox_rte_ipv4_hdr *pip = (prox_rte_ipv4_hdr *)(hdr + 1);
				prox_rte_tcp_hdr *tcp = (prox_rte_tcp_hdr *)(pip + 1);
				if (pip->next_proto_id == IPPROTO_ICMP) {
					dump_l3(tbase, mbufs[i]);
					tx_ring(tbase, tbase->l3.ctrl_plane_ring, ICMP_TO_CTRL, mbufs[i]);
					skip++;
				} else if ((tcp->src_port == TCP_PORT_BGP) || (tcp->dst_port == TCP_PORT_BGP)) {
					dump_l3(tbase, mbufs[i]);
					tx_ring(tbase, tbase->l3.ctrl_plane_ring, BGP_TO_CTRL, mbufs[i]);
					skip++;
				} else if (unlikely(skip)) {
					mbufs[i - skip] = mbufs[i];
				}
			} else if (unlikely(hdr_arp[i]->ether_hdr.ether_type == ETYPE_ARP)) {
				dump_l3(tbase, mbufs[i]);
				tx_ring(tbase, tbase->l3.ctrl_plane_ring, ARP_TO_CTRL, mbufs[i]);
				skip++;
			} else if (unlikely(skip)) {
				mbufs[i - skip] = mbufs[i];
			}
		}
	}

	if (skip)
		TASK_STATS_ADD_RX_NON_DP(&tbase->aux->stats, skip);
	if (likely(nb_rx > 0)) {
		TASK_STATS_ADD_RX(&tbase->aux->stats, nb_rx);
		return nb_rx - skip;
	}
	TASK_STATS_ADD_IDLE(&tbase->aux->stats, rte_rdtsc() - cur_tsc);
	return 0;
}
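
/* Specialized variant of rx_pkt_hw_param for tasks that receive from
   exactly one port/queue pair: the port_queue lookup and the "next"
   callback are dropped, saving a few cycles per call. */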
static inline uint16_t rx_pkt_hw1_param(struct task_base *tbase, struct rte_mbuf ***mbufs_ptr, int multi, int l3)
{
	uint16_t nb_rx, n;
	int skip = 0;

	START_EMPTY_MEASSURE();
	*mbufs_ptr = tbase->ws_mbuf->mbuf[0] +
		(RTE_ALIGN_CEIL(tbase->ws_mbuf->idx[0].prod, 2) & WS_MBUF_MASK);

	nb_rx = rte_eth_rx_burst(tbase->rx_params_hw1.rx_pq.port,
				 tbase->rx_params_hw1.rx_pq.queue,
				 *mbufs_ptr, MAX_PKT_BURST);

	if (multi) {
		n = nb_rx;
		while ((n != 0) && (MAX_PKT_BURST - nb_rx >= MIN_PMD_RX)) {
			n = rte_eth_rx_burst(tbase->rx_params_hw1.rx_pq.port,
					     tbase->rx_params_hw1.rx_pq.queue,
					     *mbufs_ptr + nb_rx, MIN_PMD_RX);
			nb_rx += n;
			PROX_PANIC(nb_rx > 64, "Received %d packets while expecting maximum %d\n", nb_rx, 64);
		}
	}

	if (l3) {
		struct rte_mbuf **mbufs = *mbufs_ptr;
		int i;
		struct ether_hdr_arp *hdr_arp[MAX_PKT_BURST];
		prox_rte_ether_hdr *hdr;
		for (i = 0; i < nb_rx; i++) {
			PREFETCH0(mbufs[i]);
		}
		for (i = 0; i < nb_rx; i++) {
			hdr_arp[i] = rte_pktmbuf_mtod(mbufs[i], struct ether_hdr_arp *);
			PREFETCH0(hdr_arp[i]);
		}
		for (i = 0; i < nb_rx; i++) {
			if (likely(hdr_arp[i]->ether_hdr.ether_type == ETYPE_IPv4)) {
				hdr = (prox_rte_ether_hdr *)hdr_arp[i];
				prox_rte_ipv4_hdr *pip = (prox_rte_ipv4_hdr *)(hdr + 1);
				prox_rte_tcp_hdr *tcp = (prox_rte_tcp_hdr *)(pip + 1);
				if (pip->next_proto_id == IPPROTO_ICMP) {
					dump_l3(tbase, mbufs[i]);
					tx_ring(tbase, tbase->l3.ctrl_plane_ring, ICMP_TO_CTRL, mbufs[i]);
					skip++;
				} else if ((tcp->src_port == TCP_PORT_BGP) || (tcp->dst_port == TCP_PORT_BGP)) {
					dump_l3(tbase, mbufs[i]);
					tx_ring(tbase, tbase->l3.ctrl_plane_ring, BGP_TO_CTRL, mbufs[i]);
					skip++;
				} else if (unlikely(skip)) {
					mbufs[i - skip] = mbufs[i];
				}
			} else if (unlikely(hdr_arp[i]->ether_hdr.ether_type == ETYPE_ARP)) {
				dump_l3(tbase, mbufs[i]);
				tx_ring(tbase, tbase->l3.ctrl_plane_ring, ARP_TO_CTRL, mbufs[i]);
				skip++;
			} else if (unlikely(skip)) {
				mbufs[i - skip] = mbufs[i];
			}
		}
	}

	if (skip)
		TASK_STATS_ADD_RX_NON_DP(&tbase->aux->stats, skip);
	if (likely(nb_rx > 0)) {
		TASK_STATS_ADD_RX(&tbase->aux->stats, nb_rx);
		return nb_rx - skip;
	}
	TASK_STATS_ADD_IDLE(&tbase->aux->stats, rte_rdtsc() - cur_tsc);
	return 0;
}
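
/* The exported rx_pkt_hw* functions below are thin wrappers that bind
   the _param helpers to a fixed combination of single/multi burst,
   linear or power-of-two port iteration, and plain or l3 (control
   plane) handling, so each task can install a specialized function
   pointer with the unused branches compiled out. */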
uint16_t rx_pkt_hw(struct task_base *tbase, struct rte_mbuf ***mbufs)
{
	return rx_pkt_hw_param(tbase, mbufs, 0, next_port, 0);
}

uint16_t rx_pkt_hw_pow2(struct task_base *tbase, struct rte_mbuf ***mbufs)
{
	return rx_pkt_hw_param(tbase, mbufs, 0, next_port_pow2, 0);
}

uint16_t rx_pkt_hw1(struct task_base *tbase, struct rte_mbuf ***mbufs)
{
	return rx_pkt_hw1_param(tbase, mbufs, 0, 0);
}

uint16_t rx_pkt_hw_multi(struct task_base *tbase, struct rte_mbuf ***mbufs)
{
	return rx_pkt_hw_param(tbase, mbufs, 1, next_port, 0);
}

uint16_t rx_pkt_hw_pow2_multi(struct task_base *tbase, struct rte_mbuf ***mbufs)
{
	return rx_pkt_hw_param(tbase, mbufs, 1, next_port_pow2, 0);
}

uint16_t rx_pkt_hw1_multi(struct task_base *tbase, struct rte_mbuf ***mbufs)
{
	return rx_pkt_hw1_param(tbase, mbufs, 1, 0);
}

uint16_t rx_pkt_hw_l3(struct task_base *tbase, struct rte_mbuf ***mbufs)
{
	return rx_pkt_hw_param(tbase, mbufs, 0, next_port, 1);
}

uint16_t rx_pkt_hw_pow2_l3(struct task_base *tbase, struct rte_mbuf ***mbufs)
{
	return rx_pkt_hw_param(tbase, mbufs, 0, next_port_pow2, 1);
}

uint16_t rx_pkt_hw1_l3(struct task_base *tbase, struct rte_mbuf ***mbufs)
{
	return rx_pkt_hw1_param(tbase, mbufs, 0, 1);
}

uint16_t rx_pkt_hw_multi_l3(struct task_base *tbase, struct rte_mbuf ***mbufs)
{
	return rx_pkt_hw_param(tbase, mbufs, 1, next_port, 1);
}

uint16_t rx_pkt_hw_pow2_multi_l3(struct task_base *tbase, struct rte_mbuf ***mbufs)
{
	return rx_pkt_hw_param(tbase, mbufs, 1, next_port_pow2, 1);
}

uint16_t rx_pkt_hw1_multi_l3(struct task_base *tbase, struct rte_mbuf ***mbufs)
{
	return rx_pkt_hw1_param(tbase, mbufs, 1, 1);
}
/* The following functions implement ring access */
uint16_t ring_deq(struct rte_ring *r, struct rte_mbuf **mbufs)
{
	void **v_mbufs = (void **)mbufs;
#ifdef BATCH_SIZE	/* Assumed build option: dequeue only full, fixed-size batches */
#if RTE_VERSION < RTE_VERSION_NUM(17,5,0,1)
	return rte_ring_sc_dequeue_bulk(r, v_mbufs, MAX_RING_BURST) < 0 ? 0 : MAX_RING_BURST;
#else
	return rte_ring_sc_dequeue_bulk(r, v_mbufs, MAX_RING_BURST, NULL);
#endif
#else
#if RTE_VERSION < RTE_VERSION_NUM(17,5,0,1)
	return rte_ring_sc_dequeue_burst(r, v_mbufs, MAX_RING_BURST);
#else
	return rte_ring_sc_dequeue_burst(r, v_mbufs, MAX_RING_BURST, NULL);
#endif
#endif
}
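
/* Poll the task's SW rings round robin, starting after the ring that
   was read last, and stop at the first non-empty ring (or after one
   full cycle). */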
uint16_t rx_pkt_sw(struct task_base *tbase, struct rte_mbuf ***mbufs)
{
	START_EMPTY_MEASSURE();
	*mbufs = tbase->ws_mbuf->mbuf[0] + (tbase->ws_mbuf->idx[0].prod & WS_MBUF_MASK);
	uint8_t lr = tbase->rx_params_sw.last_read_ring;
	uint16_t nb_rx;

	do {
		nb_rx = ring_deq(tbase->rx_params_sw.rx_rings[lr], *mbufs);
		lr = lr + 1 == tbase->rx_params_sw.nb_rxrings ? 0 : lr + 1;
	} while (!nb_rx && lr != tbase->rx_params_sw.last_read_ring);

	tbase->rx_params_sw.last_read_ring = lr;

	if (nb_rx != 0) {
		TASK_STATS_ADD_RX(&tbase->aux->stats, nb_rx);
		return nb_rx;
	}
	TASK_STATS_ADD_IDLE(&tbase->aux->stats, rte_rdtsc() - cur_tsc);
	return 0;
}
/* Same as rx_pkt_sw except with a mask for the number of receive
   rings (can only be used if nb_rxrings is a power of 2). */
uint16_t rx_pkt_sw_pow2(struct task_base *tbase, struct rte_mbuf ***mbufs)
{
	START_EMPTY_MEASSURE();
	*mbufs = tbase->ws_mbuf->mbuf[0] + (tbase->ws_mbuf->idx[0].prod & WS_MBUF_MASK);
	uint8_t lr = tbase->rx_params_sw.last_read_ring;
	uint16_t nb_rx;

	do {
		nb_rx = ring_deq(tbase->rx_params_sw.rx_rings[lr], *mbufs);
		lr = (lr + 1) & tbase->rx_params_sw.rxrings_mask;
	} while (!nb_rx && lr != tbase->rx_params_sw.last_read_ring);

	tbase->rx_params_sw.last_read_ring = lr;

	if (nb_rx != 0) {
		TASK_STATS_ADD_RX(&tbase->aux->stats, nb_rx);
		return nb_rx;
	}
	TASK_STATS_ADD_IDLE(&tbase->aux->stats, rte_rdtsc() - cur_tsc);
	return 0;
}
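
/* Receive the packets that the task previously sent to itself: the TX
   side stores them in the task's own ws_mbuf buffer and bumps nb_rx,
   which is consumed and reset here. */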
uint16_t rx_pkt_self(struct task_base *tbase, struct rte_mbuf ***mbufs)
{
	START_EMPTY_MEASSURE();
	uint16_t nb_rx = tbase->ws_mbuf->idx[0].nb_rx;
	if (nb_rx) {
		tbase->ws_mbuf->idx[0].nb_rx = 0;
		*mbufs = tbase->ws_mbuf->mbuf[0] + (tbase->ws_mbuf->idx[0].prod & WS_MBUF_MASK);
		TASK_STATS_ADD_RX(&tbase->aux->stats, nb_rx);
		return nb_rx;
	}
	TASK_STATS_ADD_IDLE(&tbase->aux->stats, rte_rdtsc() - cur_tsc);
	return 0;
}
/* Used for tasks that do not receive packets (e.g. packet
   generation). Always returns 1 but never returns packets and does
   not increment statistics. This makes it possible to use the same
   code path as for tasks that actually receive packets. */
uint16_t rx_pkt_dummy(__attribute__((unused)) struct task_base *tbase,
		      __attribute__((unused)) struct rte_mbuf ***mbufs)
{
	return 1;
}
/* After the system has been configured, it is known if there is only
   one RX ring. If this is the case, a more specialized version of the
   function above can be used to save cycles. */
uint16_t rx_pkt_sw1(struct task_base *tbase, struct rte_mbuf ***mbufs)
{
	START_EMPTY_MEASSURE();
	*mbufs = tbase->ws_mbuf->mbuf[0] + (tbase->ws_mbuf->idx[0].prod & WS_MBUF_MASK);
	uint16_t nb_rx = ring_deq(tbase->rx_params_sw1.rx_ring, *mbufs);

	if (nb_rx != 0) {
		TASK_STATS_ADD_RX(&tbase->aux->stats, nb_rx);
		return nb_rx;
	}
	TASK_STATS_ADD_IDLE(&tbase->aux->stats, rte_rdtsc() - cur_tsc);
	return 0;
}
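
/* The dump/trace/distr/bw/tsc functions below are stacked on top of
   the task's regular RX function: call_prev_rx_pkt invokes the next
   function down the rx_pkt_prev stack, so each wrapper can
   post-process the packets it returns. */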
static uint16_t call_prev_rx_pkt(struct task_base *tbase, struct rte_mbuf ***mbufs)
{
	uint16_t ret;

	tbase->aux->rx_prev_idx++;
	ret = tbase->aux->rx_pkt_prev[tbase->aux->rx_prev_idx - 1](tbase, mbufs);
	tbase->aux->rx_prev_idx--;

	return ret;
}
/* Only used when there are packets to be dumped. This function is
   meant as a debugging tool and is therefore not optimized. When the
   number of packets to dump falls back to 0, the original (optimized)
   RX function is restored. This makes it possible to support dumping
   packets without any performance impact if the feature is not used. */
uint16_t rx_pkt_dump(struct task_base *tbase, struct rte_mbuf ***mbufs)
{
	uint16_t ret = call_prev_rx_pkt(tbase, mbufs);

	if (ret) {
		uint32_t n_dump = tbase->aux->task_rt_dump.n_print_rx;
		n_dump = ret < n_dump ? ret : n_dump;

		if ((tbase->aux->task_rt_dump.input == NULL) || (tbase->aux->task_rt_dump.input->reply == NULL)) {
			for (uint32_t i = 0; i < n_dump; ++i) {
				plogdx_info((*mbufs)[i], "RX: ");
			}
		} else {
			struct input *input = tbase->aux->task_rt_dump.input;

			for (uint32_t i = 0; i < n_dump; ++i) {
				/* TODO: Execute callback with full
				   data in a single call. */
				char tmp[128];
				int strlen;
#if RTE_VERSION >= RTE_VERSION_NUM(1,8,0,0)
				int port_id = ((*mbufs)[i])->port;
#else
				int port_id = ((*mbufs)[i])->pkt.in_port;
#endif
				strlen = snprintf(tmp, sizeof(tmp), "pktdump,%d,%d\n", port_id,
						  rte_pktmbuf_pkt_len((*mbufs)[i]));

				input->reply(input, tmp, strlen);
				input->reply(input, rte_pktmbuf_mtod((*mbufs)[i], char *), rte_pktmbuf_pkt_len((*mbufs)[i]));
				input->reply(input, "\n", 1);
			}
		}

		tbase->aux->task_rt_dump.n_print_rx -= n_dump;

		if (0 == tbase->aux->task_rt_dump.n_print_rx) {
			task_base_del_rx_pkt_function(tbase, rx_pkt_dump);
		}
	}
	return ret;
}
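
/* Like rx_pkt_dump, but instead of printing, copy the first bytes of
   up to MAX_RING_BURST packets aside (pkt_cpy) together with their
   lengths and mbuf addresses, so the TX path can correlate and dump
   them after processing. */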
uint16_t rx_pkt_trace(struct task_base *tbase, struct rte_mbuf ***mbufs)
{
	tbase->aux->task_rt_dump.cur_trace = 0;
	uint16_t ret = call_prev_rx_pkt(tbase, mbufs);

	if (ret) {
		uint32_t n_trace = tbase->aux->task_rt_dump.n_trace;
		n_trace = ret < n_trace ? ret : n_trace;
		n_trace = n_trace <= MAX_RING_BURST ? n_trace : MAX_RING_BURST;

		for (uint32_t i = 0; i < n_trace; ++i) {
			uint8_t *pkt = rte_pktmbuf_mtod((*mbufs)[i], uint8_t *);
			rte_memcpy(tbase->aux->task_rt_dump.pkt_cpy[i], pkt, sizeof(tbase->aux->task_rt_dump.pkt_cpy[i]));
			tbase->aux->task_rt_dump.pkt_cpy_len[i] = rte_pktmbuf_pkt_len((*mbufs)[i]);
			tbase->aux->task_rt_dump.pkt_mbuf_addr[i] = (*mbufs)[i];
		}
		tbase->aux->task_rt_dump.cur_trace += n_trace;
		tbase->aux->task_rt_dump.n_trace -= n_trace;
		/* Unset by TX when n_trace = 0 */
	}
	return ret;
}
/* Gather the distribution of the number of packets that have been
   received from one RX call. Since the value is only modified by the
   task that receives the packet, no atomic operation is needed. */
uint16_t rx_pkt_distr(struct task_base *tbase, struct rte_mbuf ***mbufs)
{
	uint16_t ret = call_prev_rx_pkt(tbase, mbufs);

	if (likely(ret < RX_BUCKET_SIZE))
		tbase->aux->rx_bucket[ret]++;
	else
		tbase->aux->rx_bucket[RX_BUCKET_SIZE - 1]++;
	return ret;
}
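
/* Account the wire-level byte count of the packets returned by the
   underlying RX function, using mbuf_wire_size to include per-packet
   Ethernet framing overhead. */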
uint16_t rx_pkt_bw(struct task_base *tbase, struct rte_mbuf ***mbufs)
{
	uint16_t ret = call_prev_rx_pkt(tbase, mbufs);
	uint32_t tot_bytes = 0;

	for (uint16_t i = 0; i < ret; ++i) {
		tot_bytes += mbuf_wire_size((*mbufs)[i]);
	}

	TASK_STATS_ADD_RX_BYTES(&tbase->aux->stats, tot_bytes);

	return ret;
}
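
/* Record the TSC immediately before and after the underlying RX call,
   making the cost of packet reception visible to the stats code. */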
uint16_t rx_pkt_tsc(struct task_base *tbase, struct rte_mbuf ***mbufs)
{
	uint64_t before = rte_rdtsc();
	uint16_t ret = call_prev_rx_pkt(tbase, mbufs);
	uint64_t after = rte_rdtsc();

	tbase->aux->tsc_rx.before = before;
	tbase->aux->tsc_rx.after = after;

	return ret;
}