2e1616af17401e7f78dbe19e33528caf21fd36fc
[samplevnf.git] / VNFs / DPPD-PROX / main.c
1 /*
2 // Copyright (c) 2010-2017 Intel Corporation
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 //     http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 */
16
17 #include <string.h>
18 #include <locale.h>
19 #include <unistd.h>
20 #include <signal.h>
21
22 #include <rte_cycles.h>
23 #include <rte_atomic.h>
24 #include <rte_table_hash.h>
25 #include <rte_memzone.h>
26 #include <rte_errno.h>
27
28 #include "prox_malloc.h"
29 #include "run.h"
30 #include "main.h"
31 #include "log.h"
32 #include "quit.h"
33 #include "clock.h"
34 #include "defines.h"
35 #include "version.h"
36 #include "prox_args.h"
37 #include "prox_assert.h"
38 #include "prox_cfg.h"
39 #include "prox_shared.h"
40 #include "prox_port_cfg.h"
41 #include "toeplitz.h"
42 #include "hash_utils.h"
43 #include "handle_lb_net.h"
44 #include "prox_cksum.h"
45 #include "thread_nop.h"
46 #include "thread_generic.h"
47 #include "thread_pipeline.h"
48 #include "cqm.h"
49 #include "handle_master.h"
50
51 #if RTE_VERSION < RTE_VERSION_NUM(1,8,0,0)
52 #define RTE_CACHE_LINE_SIZE CACHE_LINE_SIZE
53 #endif
54
55 uint8_t lb_nb_txrings = 0xff;
56 struct rte_ring *ctrl_rings[RTE_MAX_LCORE*MAX_TASKS_PER_CORE];
57
/* Print the command line help through plog_info() and terminate the
   process with EXIT_FAILURE. prgname is inserted in the synopsis
   line. This function never returns. */
static void __attribute__((noreturn)) prox_usage(const char *prgname)
{
        plog_info("\nUsage: %s [-f CONFIG_FILE] [-a|-e] [-m|-s|-i] [-w DEF] [-u] [-t]\n"
                  "\t-f CONFIG_FILE : configuration file to load, ./prox.cfg by default\n"
                  "\t-l LOG_FILE : log file name, ./prox.log by default\n"
                  "\t-p : include PID in log file name if default log file is used\n"
                  "\t-o DISPLAY: Set display to use, can be 'curses' (default), 'cli' or 'none'\n"
                  "\t-v verbosity : initial logging verbosity\n"
                  "\t-a : autostart all cores (by default)\n"
                  "\t-e : don't autostart\n"
                  "\t-n : Create NULL devices instead of using PCI devices, useful together with -i\n"
                  "\t-m : list supported task modes and exit\n"
                  "\t-s : check configuration file syntax and exit\n"
                  "\t-i : check initialization sequence and exit\n"
                  "\t-u : Listen on UDS /tmp/prox.sock\n"
                  "\t-t : Listen on TCP port 8474\n"
                  "\t-q : Pass argument to Lua interpreter, useful to define variables\n"
                  "\t-w : define variable using syntax varname=value\n"
                  "\t     takes precedence over variables defined in CONFIG_FILE\n"
                  "\t-k : Log statistics to file \"stats_dump\" in current directory\n"
                  /* The previous wording ("until PROX is not initialized")
                     stated the opposite of what is meant. */
                  "\t-d : Run as daemon, the parent process will block until PROX is initialized\n"
                  "\t-z : Ignore CPU topology, implies -i\n"
                  "\t-r : Change initial screen refresh rate. If set to a lower than 0.001 seconds,\n"
                  "\t     screen refreshing will be disabled\n"
                  , prgname);
        exit(EXIT_FAILURE);
}
85
86 static void check_mixed_normal_pipeline(void)
87 {
88         struct lcore_cfg *lconf = NULL;
89         uint32_t lcore_id = -1;
90
91         while (prox_core_next(&lcore_id, 0) == 0) {
92                 lconf = &lcore_cfg[lcore_id];
93
94                 int all_thread_nop = 1;
95                 int generic = 0;
96                 int pipeline = 0;
97                 int l3 = 0;
98                 for (uint8_t task_id = 0; task_id < lconf->n_tasks_all; ++task_id) {
99                         struct task_args *targ = &lconf->targs[task_id];
100                         l3 = !strcmp("l3", targ->sub_mode_str);
101                         all_thread_nop = all_thread_nop && !l3 &&
102                                 targ->task_init->thread_x == thread_nop;
103
104                         pipeline = pipeline || targ->task_init->thread_x == thread_pipeline;
105                         generic = generic || targ->task_init->thread_x == thread_generic || l3;
106                 }
107                 PROX_PANIC(generic && pipeline, "Can't run both pipeline and normal thread on same core\n");
108
109                 if (all_thread_nop)
110                         lconf->thread_x = thread_nop;
111                 else {
112                         lconf->thread_x = thread_generic;
113                 }
114         }
115 }
116
117 static void check_zero_rx(void)
118 {
119         struct lcore_cfg *lconf = NULL;
120         struct task_args *targ;
121
122         while (core_targ_next(&lconf, &targ, 0) == 0) {
123                 if (targ->nb_rxports != 0) {
124                         PROX_PANIC(task_init_flag_set(targ->task_init, TASK_FEATURE_NO_RX),
125                            "\tCore %u task %u: rx_ports configured while mode %s does not use it\n", lconf->id, targ->id, targ->task_init->mode_str);
126                 }
127         }
128 }
129
130 static void check_missing_rx(void)
131 {
132         struct lcore_cfg *lconf = NULL, *rx_lconf = NULL, *tx_lconf = NULL;
133         struct task_args *targ, *rx_targ = NULL, *tx_targ = NULL;
134         struct prox_port_cfg *port;
135         uint8_t port_id, rx_port_id, ok;
136
137         while (core_targ_next(&lconf, &targ, 0) == 0) {
138                 PROX_PANIC((targ->flags & TASK_ARG_RX_RING) && targ->rx_rings[0] == 0 && !targ->tx_opt_ring_task,
139                            "Configuration Error - Core %u task %u Receiving from ring, but nobody xmitting to this ring\n", lconf->id, targ->id);
140                 if (targ->nb_rxports == 0 && targ->nb_rxrings == 0) {
141                         PROX_PANIC(!task_init_flag_set(targ->task_init, TASK_FEATURE_NO_RX),
142                                    "\tCore %u task %u: no rx_ports and no rx_rings configured while required by mode %s\n", lconf->id, targ->id, targ->task_init->mode_str);
143                 }
144         }
145
146         lconf = NULL;
147         while (core_targ_next(&lconf, &targ, 0) == 0) {
148                 if (strcmp(targ->sub_mode_str, "l3") != 0)
149                         continue;
150
151                 PROX_PANIC((targ->nb_rxports == 0) && (targ->nb_txports == 0), "L3 task must have a RX or a TX port\n");
152                 // If the L3 sub_mode receives from a port, check that there is at least one core/task
153                 // transmitting to this port in L3 sub_mode
154                 for (uint8_t i = 0; i < targ->nb_rxports; ++i) {
155                         rx_port_id = targ->rx_port_queue[i].port;
156                         ok = 0;
157                         tx_lconf = NULL;
158                         while (core_targ_next(&tx_lconf, &tx_targ, 0) == 0) {
159                                 if ((port_id = tx_targ->tx_port_queue[0].port) == OUT_DISCARD)
160                                         continue;
161                                 if ((rx_port_id == port_id) && (tx_targ->flags & TASK_ARG_L3)){
162                                         ok = 1;
163                                         break;
164                                 }
165                         }
166                         PROX_PANIC(ok == 0, "RX L3 sub mode for port %d on core %d task %d, but no core/task transmitting on that port\n", rx_port_id, lconf->id, targ->id);
167                 }
168
169                 // If the L3 sub_mode transmits to a port, check that there is at least one core/task
170                 // receiving from that port in L3 sub_mode.
171                 if ((port_id = targ->tx_port_queue[0].port) == OUT_DISCARD)
172                         continue;
173                 rx_lconf = NULL;
174                 ok = 0;
175                 plog_info("\tCore %d task %d transmitting to port %d in L3 mode\n", lconf->id, targ->id, port_id);
176                 while (core_targ_next(&rx_lconf, &rx_targ, 0) == 0) {
177                         for (uint8_t i = 0; i < rx_targ->nb_rxports; ++i) {
178                                 rx_port_id = rx_targ->rx_port_queue[i].port;
179                                 if ((rx_port_id == port_id) && (rx_targ->flags & TASK_ARG_L3)){
180                                         ok = 1;
181                                         break;
182                                 }
183                         }
184                         if (ok == 1) {
185                                 plog_info("\tCore %d task %d has found core %d task %d receiving from port %d\n", lconf->id, targ->id, rx_lconf->id, rx_targ->id, port_id);
186                                 break;
187                         }
188                 }
189                 PROX_PANIC(ok == 0, "L3 sub mode for port %d on core %d task %d, but no core/task receiving on that port\n", port_id, lconf->id, targ->id);
190         }
191 }
192
/* Run all configuration consistency checks. Each check panics on the
   first problem it finds, so the call order determines which error is
   reported first. */
static void check_cfg_consistent(void)
{
        check_missing_rx();             /* rx ring/port presence, L3 rx/tx pairing */
        check_zero_rx();                /* rx ports on modes that do not receive */
        check_mixed_normal_pipeline();  /* also selects the thread function per core */
}
199
200 static void plog_all_rings(void)
201 {
202         struct lcore_cfg *lconf = NULL;
203         struct task_args *targ;
204
205         while (core_targ_next(&lconf, &targ, 0) == 0) {
206                 for (uint8_t ring_idx = 0; ring_idx < targ->nb_rxrings; ++ring_idx) {
207                         plog_info("\tCore %u, task %u, rx_ring[%u] %p\n", lconf->id, targ->id, ring_idx, targ->rx_rings[ring_idx]);
208                 }
209         }
210 }
211
212 static int chain_flag_state(struct task_args *targ, uint64_t flag, int is_set)
213 {
214         if (task_init_flag_set(targ->task_init, flag) == is_set)
215                 return 1;
216
217         int ret = 0;
218
219         for (uint32_t i = 0; i < targ->n_prev_tasks; ++i) {
220                 ret = chain_flag_state(targ->prev_tasks[i], flag, is_set);
221                 if (ret)
222                         return 1;
223         }
224         return 0;
225 }
226
227 static void configure_if_tx_queues(struct task_args *targ, uint8_t socket)
228 {
229         uint8_t if_port;
230
231         for (uint8_t i = 0; i < targ->nb_txports; ++i) {
232                 if_port = targ->tx_port_queue[i].port;
233
234                 PROX_PANIC(if_port == OUT_DISCARD, "port misconfigured, exiting\n");
235
236                 PROX_PANIC(!prox_port_cfg[if_port].active, "\tPort %u not used, skipping...\n", if_port);
237
238                 int dsocket = prox_port_cfg[if_port].socket;
239                 if (dsocket != -1 && dsocket != socket) {
240                         plog_warn("TX core on socket %d while device on socket %d\n", socket, dsocket);
241                 }
242
243                 if (prox_port_cfg[if_port].tx_ring[0] == '\0') {  // Rings-backed port can use single queue
244                         targ->tx_port_queue[i].queue = prox_port_cfg[if_port].n_txq;
245                         prox_port_cfg[if_port].n_txq++;
246                 } else {
247                         prox_port_cfg[if_port].n_txq = 1;
248                         targ->tx_port_queue[i].queue = 0;
249                 }
250                 /* Set the ETH_TXQ_FLAGS_NOREFCOUNT flag if none of
251                    the tasks up to the task transmitting to the port
252                    use refcnt. */
253                 if (!chain_flag_state(targ, TASK_FEATURE_TXQ_FLAGS_REFCOUNT, 1)) {
254                         prox_port_cfg[if_port].tx_conf.txq_flags |= ETH_TXQ_FLAGS_NOREFCOUNT;
255                 }
256
257                 /* By default OFFLOAD is enabled, but if the whole
258                    chain has NOOFFLOADS set all the way until the
259                    first task that receives from a port, it will be
260                    disabled for the destination port. */
261                 if (!chain_flag_state(targ, TASK_FEATURE_TXQ_FLAGS_NOOFFLOADS, 0)) {
262                         prox_port_cfg[if_port].tx_conf.txq_flags |= ETH_TXQ_FLAGS_NOOFFLOADS;
263                 }
264
265         }
266 }
267
268 static void configure_if_rx_queues(struct task_args *targ, uint8_t socket)
269 {
270         struct prox_port_cfg *port;
271         for (int i = 0; i < targ->nb_rxports; i++) {
272                 uint8_t if_port = targ->rx_port_queue[i].port;
273
274                 if (if_port == OUT_DISCARD) {
275                         return;
276                 }
277
278                 port = &prox_port_cfg[if_port];
279                 PROX_PANIC(!port->active, "Port %u not used, aborting...\n", if_port);
280
281                 if(port->rx_ring[0] != '\0') {
282                         port->n_rxq = 0;
283                 }
284
285                 // If the mbuf size (of the rx task) is not big enough, we might receive multiple segments
286                 // This is usually the case when setting a big mtu size i.e. enabling jumbo frames.
287                 // If the packets get transmitted, then multi segments will have to be enabled on the TX port
288                 uint16_t max_frame_size = port->mtu + ETHER_HDR_LEN + ETHER_CRC_LEN + 2 * PROX_VLAN_TAG_SIZE;
289                 if (max_frame_size + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM > targ->mbuf_size) {
290                         targ->task_init->flag_features |= TASK_FEATURE_TXQ_FLAGS_MULTSEGS;
291                 }
292                 targ->rx_port_queue[i].queue = port->n_rxq;
293                 port->pool[targ->rx_port_queue[i].queue] = targ->pool;
294                 port->pool_size[targ->rx_port_queue[i].queue] = targ->nb_mbuf - 1;
295                 port->n_rxq++;
296
297                 int dsocket = port->socket;
298                 if (dsocket != -1 && dsocket != socket) {
299                         plog_warn("RX core on socket %d while device on socket %d\n", socket, dsocket);
300                 }
301         }
302 }
303
304 static void configure_multi_segments(void)
305 {
306         struct lcore_cfg *lconf = NULL;
307         struct task_args *targ;
308         uint8_t if_port;
309
310         while (core_targ_next(&lconf, &targ, 0) == 0) {
311                 for (uint8_t i = 0; i < targ->nb_txports; ++i) {
312                         if_port = targ->tx_port_queue[i].port;
313                         // Multi segment is disabled for most tasks. It is only enabled for tasks requiring big packets.
314                         // We can only enable "no multi segment" if no such task exists in the chain of tasks.
315                         if (!chain_flag_state(targ, TASK_FEATURE_TXQ_FLAGS_MULTSEGS, 1)) {
316                                 prox_port_cfg[if_port].tx_conf.txq_flags |= ETH_TXQ_FLAGS_NOMULTSEGS;
317                         }
318                 }
319         }
320 }
321
322 static void configure_if_queues(void)
323 {
324         struct lcore_cfg *lconf = NULL;
325         struct task_args *targ;
326         uint8_t socket;
327
328         while (core_targ_next(&lconf, &targ, 0) == 0) {
329                 socket = rte_lcore_to_socket_id(lconf->id);
330
331                 configure_if_rx_queues(targ, socket);
332                 configure_if_tx_queues(targ, socket);
333         }
334 }
335
/* Return the next ring name from a fixed sequence of one and two
   character names: first every symbol of the alphabet on its own
   ("A", "B", ...), then two character names ("AA", "BA", ...).

   The returned pointer refers to a static buffer that is overwritten by
   the next call, so the caller must use or copy the name before calling
   again. The function keeps static state and is not reentrant.

   The sequence repeats once all one and two character combinations
   have been handed out. */
static const char *gen_ring_name(void)
{
        static const char ring_names[] =
                "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
                "abcdefghijklmnopqrstuvwxyz"
                "[\\]^_`!\"#$%&'()*+,-./:;<="
                ">?@{|}0123456789";
        /* Alphabet size known at compile time, without the terminating
           NUL; avoids calling strlen() three times per name. */
        enum { N_SYMBOLS = sizeof(ring_names) - 1 };
        static char retval[] = "XX";
        /* Unsigned so that the counter wraps instead of overflowing. */
        static unsigned next_idx = 0;

        const unsigned idx = next_idx++;
        const unsigned high = idx / N_SYMBOLS;

        retval[0] = ring_names[idx % N_SYMBOLS];
        /* high == 0 selects the single character names. */
        retval[1] = high ? ring_names[(high - 1) % N_SYMBOLS] : '\0';

        return retval;
}
356
/* Counters filled in by init_ring_between_tasks() while the rings
   between tasks are set up. */
struct ring_init_stats {
        uint32_t n_pkt_rings;   /* packet rings connecting two tasks */
        uint32_t n_ctrl_rings;  /* control rings (message or packet type) */
        uint32_t n_opt_rings;   /* rings that were optimized away */
};
362
363 static uint32_t ring_init_stats_total(const struct ring_init_stats *ris)
364 {
365         return ris->n_pkt_rings + ris->n_ctrl_rings + ris->n_opt_rings;
366 }
367
368 static uint32_t count_incoming_tasks(uint32_t lcore_worker, uint32_t dest_task)
369 {
370         struct lcore_cfg *lconf = NULL;
371         struct task_args *targ;
372         uint32_t ret = 0;
373         struct core_task ct;
374
375         while (core_targ_next(&lconf, &targ, 0) == 0) {
376                 for (uint8_t idxx = 0; idxx < MAX_PROTOCOLS; ++idxx) {
377                         for (uint8_t ridx = 0; ridx < targ->core_task_set[idxx].n_elems; ++ridx) {
378                                 ct = targ->core_task_set[idxx].core_task[ridx];
379
380                                 if (dest_task == ct.task && lcore_worker == ct.core)
381                                         ret++;
382                         }
383                 }
384         }
385         return ret;
386 }
387
388 static struct rte_ring *get_existing_ring(uint32_t lcore_id, uint32_t task_id)
389 {
390         if (!prox_core_active(lcore_id, 0))
391                 return NULL;
392
393         struct lcore_cfg *lconf = &lcore_cfg[lcore_id];
394
395         if (task_id >= lconf->n_tasks_all)
396                 return NULL;
397
398         if (lconf->targs[task_id].nb_rxrings == 0)
399                 return NULL;
400
401         return lconf->targs[task_id].rx_rings[0];
402 }
403
404 static struct rte_ring *init_ring_between_tasks(struct lcore_cfg *lconf, struct task_args *starg,
405                                     const struct core_task ct, uint8_t ring_idx, int idx,
406                                     struct ring_init_stats *ris)
407 {
408         uint8_t socket;
409         struct rte_ring *ring = NULL;
410         struct lcore_cfg *lworker;
411         struct task_args *dtarg;
412
413         PROX_ASSERT(prox_core_active(ct.core, 0));
414         lworker = &lcore_cfg[ct.core];
415
416         /* socket used is the one that the sending core resides on */
417         socket = rte_lcore_to_socket_id(lconf->id);
418
419         plog_info("\t\tCreating ring on socket %u with size %u\n"
420                   "\t\t\tsource core, task and socket = %u, %u, %u\n"
421                   "\t\t\tdestination core, task and socket = %u, %u, %u\n"
422                   "\t\t\tdestination worker id = %u\n",
423                   socket, starg->ring_size,
424                   lconf->id, starg->id, socket,
425                   ct.core, ct.task, rte_lcore_to_socket_id(ct.core),
426                   ring_idx);
427
428         if (ct.type) {
429                 struct rte_ring **dring = NULL;
430
431                 if (ct.type == CTRL_TYPE_MSG)
432                         dring = &lworker->ctrl_rings_m[ct.task];
433                 else if (ct.type == CTRL_TYPE_PKT) {
434                         dring = &lworker->ctrl_rings_p[ct.task];
435                         starg->flags |= TASK_ARG_CTRL_RINGS_P;
436                 }
437
438                 if (*dring == NULL)
439                         ring = rte_ring_create(gen_ring_name(), starg->ring_size, socket, RING_F_SC_DEQ);
440                 else
441                         ring = *dring;
442                 PROX_PANIC(ring == NULL, "Cannot create ring to connect I/O core %u with worker core %u\n", lconf->id, ct.core);
443
444                 starg->tx_rings[starg->tot_n_txrings_inited] = ring;
445                 starg->tot_n_txrings_inited++;
446                 *dring = ring;
447                 if (lconf->id == prox_cfg.master) {
448                         ctrl_rings[ct.core*MAX_TASKS_PER_CORE + ct.task] = ring;
449                 } else if (ct.core == prox_cfg.master) {
450                         starg->ctrl_plane_ring = ring;
451                 }
452
453                 plog_info("\t\tCore %u task %u to -> core %u task %u ctrl_ring %s %p %s\n",
454                           lconf->id, starg->id, ct.core, ct.task, ct.type == CTRL_TYPE_PKT?
455                           "pkt" : "msg", ring, ring->name);
456                 ris->n_ctrl_rings++;
457                 return ring;
458         }
459
460         dtarg = &lworker->targs[ct.task];
461         lworker->targs[ct.task].worker_thread_id = ring_idx;
462         PROX_ASSERT(dtarg->flags & TASK_ARG_RX_RING);
463         PROX_ASSERT(ct.task < lworker->n_tasks_all);
464
465         /* If all the following conditions are met, the ring can be
466            optimized away. */
467         if (!task_is_master(starg) && !task_is_master(dtarg) && starg->lconf->id == dtarg->lconf->id &&
468             starg->nb_txrings == 1 && idx == 0 && dtarg->task &&
469             dtarg->tot_rxrings == 1 && starg->task == dtarg->task - 1) {
470                 plog_info("\t\tOptimizing away ring on core %u from task %u to task %u\n",
471                           dtarg->lconf->id, starg->task, dtarg->task);
472                 /* No need to set up ws_mbuf. */
473                 starg->tx_opt_ring = 1;
474                 /* During init of destination task, the buffer in the
475                    source task will be initialized. */
476                 dtarg->tx_opt_ring_task = starg;
477                 ris->n_opt_rings++;
478                 ++dtarg->nb_rxrings;
479                 return NULL;
480         }
481
482         int ring_created = 1;
483         /* Only create multi-producer rings if configured to do so AND
484            there is only one task sending to the task */
485         if ((prox_cfg.flags & DSF_MP_RINGS && count_incoming_tasks(ct.core, ct.task) > 1)
486                 || (prox_cfg.flags & DSF_ENABLE_BYPASS)) {
487                 ring = get_existing_ring(ct.core, ct.task);
488
489                 if (ring) {
490                         plog_info("\t\tCore %u task %u creatign MP ring %p to core %u task %u\n",
491                                   lconf->id, starg->id, ring, ct.core, ct.task);
492                         ring_created = 0;
493                 }
494                 else {
495                         ring = rte_ring_create(gen_ring_name(), starg->ring_size, socket, RING_F_SC_DEQ);
496                         plog_info("\t\tCore %u task %u using MP ring %p from core %u task %u\n",
497                                   lconf->id, starg->id, ring, ct.core, ct.task);
498                 }
499         }
500         else
501                 ring = rte_ring_create(gen_ring_name(), starg->ring_size, socket, RING_F_SP_ENQ | RING_F_SC_DEQ);
502
503         PROX_PANIC(ring == NULL, "Cannot create ring to connect I/O core %u with worker core %u\n", lconf->id, ct.core);
504
505         starg->tx_rings[starg->tot_n_txrings_inited] = ring;
506         starg->tot_n_txrings_inited++;
507
508         if (ring_created) {
509                 PROX_ASSERT(dtarg->nb_rxrings < MAX_RINGS_PER_TASK);
510                 dtarg->rx_rings[dtarg->nb_rxrings] = ring;
511                 ++dtarg->nb_rxrings;
512         }
513         dtarg->nb_slave_threads = starg->core_task_set[idx].n_elems;
514         dtarg->lb_friend_core = lconf->id;
515         dtarg->lb_friend_task = starg->id;
516         plog_info("\t\tWorker thread %d has core %d, task %d as a lb friend\n", ct.core, lconf->id, starg->id);
517         plog_info("\t\tCore %u task %u tx_ring[%u] -> core %u task %u rx_ring[%u] %p %s %u WT\n",
518                   lconf->id, starg->id, ring_idx, ct.core, ct.task, dtarg->nb_rxrings, ring, ring->name,
519                   dtarg->nb_slave_threads);
520         ++ris->n_pkt_rings;
521         return ring;
522 }
523
524 static void init_rings(void)
525 {
526         struct lcore_cfg *lconf = NULL;
527         struct task_args *starg;
528         struct ring_init_stats ris = {0};
529
530         while (core_targ_next(&lconf, &starg, 1) == 0) {
531                 plog_info("\t*** Initializing rings on core %u, task %u ***\n", lconf->id, starg->id);
532                 for (uint8_t idx = 0; idx < MAX_PROTOCOLS; ++idx) {
533                         for (uint8_t ring_idx = 0; ring_idx < starg->core_task_set[idx].n_elems; ++ring_idx) {
534                                 PROX_ASSERT(ring_idx < MAX_WT_PER_LB);
535                                 PROX_ASSERT(starg->tot_n_txrings_inited < MAX_RINGS_PER_TASK);
536
537                                 struct core_task ct = starg->core_task_set[idx].core_task[ring_idx];
538                                 init_ring_between_tasks(lconf, starg, ct, ring_idx, idx, &ris);
539                         }
540                 }
541         }
542
543         plog_info("\tInitialized %d rings:\n"
544                   "\t\tNumber of packet rings: %u\n"
545                   "\t\tNumber of control rings: %u\n"
546                   "\t\tNumber of optimized rings: %u\n",
547                   ring_init_stats_total(&ris),
548                   ris.n_pkt_rings,
549                   ris.n_ctrl_rings,
550                   ris.n_opt_rings);
551
552         lconf = NULL;
553         struct prox_port_cfg *port;
554         while (core_targ_next(&lconf, &starg, 1) == 0) {
555                 if ((starg->task_init) && (starg->flags & TASK_ARG_L3)) {
556                         struct core_task ct;
557                         ct.core = prox_cfg.master;
558                         ct.task = 0;
559                         ct.type = CTRL_TYPE_PKT;
560                         struct rte_ring *rx_ring = init_ring_between_tasks(lconf, starg, ct, 0, 0, &ris);
561
562                         ct.core = lconf->id;
563                         ct.task = starg->id;;
564                         struct rte_ring *tx_ring = init_ring_between_tasks(lcore_cfg, lcore_cfg[prox_cfg.master].targs, ct, 0, 0, &ris);
565                 }
566         }
567 }
568
/* Randomize the order of the objects in mempool.

   Up to nb_mbuf objects are taken out of the pool one by one and put
   back in a random order (rand() is used, so the order depends on the
   seed of the C library generator). Objects beyond the first nb_mbuf
   are left untouched.

   Shuffling is skipped when nb_mbuf is 0 or when the temporary array
   cannot be allocated. */
static void shuffle_mempool(struct rte_mempool* mempool, uint32_t nb_mbuf)
{
        if (nb_mbuf == 0)
                return;

        struct rte_mbuf **pkts = prox_zmalloc(nb_mbuf * sizeof(*pkts), rte_socket_id());
        if (pkts == NULL)
                return;

        /* Never take out more objects than the array can hold. */
        uint32_t got = 0;
        while (got < nb_mbuf && rte_mempool_get_bulk(mempool, (void **)(pkts + got), 1) == 0)
                ++got;

        /* Return a randomly chosen object and move the last one into
           the freed slot. Every index in [0, remaining) refers to an
           object that still has to be returned, so no index outside the
           array is ever produced (the previous "rand() % nb_mbuf - 1"
           could yield -1). */
        for (uint32_t remaining = got; remaining > 0; --remaining) {
                const uint32_t pick = (uint32_t)rand() % remaining;

                rte_mempool_put_bulk(mempool, (void **)&pkts[pick], 1);
                pkts[pick] = pkts[remaining - 1];
        }
        prox_free(pkts);
}
589
590 static void set_mbuf_size(struct task_args *targ)
591 {
592         /* mbuf size can be set
593          *  - from config file (highest priority, overwriting any other config) - should only be used as workaround
594          *  - defaulted to MBUF_SIZE.
595          * Except if set explicitely, ensure that size is big enough for vmxnet3 driver
596          */
597         if (targ->mbuf_size)
598                 return;
599
600         targ->mbuf_size = MBUF_SIZE;
601         struct prox_port_cfg *port;
602         uint16_t max_frame_size = 0, min_buffer_size = 0;
603         int i40e = 0;
604         for (int i = 0; i < targ->nb_rxports; i++) {
605                 uint8_t if_port = targ->rx_port_queue[i].port;
606
607                 if (if_port == OUT_DISCARD) {
608                         continue;
609                 }
610                 port = &prox_port_cfg[if_port];
611                 if (max_frame_size < port->mtu + ETHER_HDR_LEN + ETHER_CRC_LEN + 2 * PROX_VLAN_TAG_SIZE)
612                         max_frame_size = port->mtu + ETHER_HDR_LEN + ETHER_CRC_LEN + 2 * PROX_VLAN_TAG_SIZE;
613                 if (min_buffer_size < port->min_rx_bufsize)
614                         min_buffer_size = port->min_rx_bufsize;
615
616                 // Check whether we receive from i40e. This driver have extra mbuf size requirements
617                 if (strcmp(port->short_name, "i40e") == 0)
618                         i40e = 1;
619         }
620         if (i40e) {
621                 // i40e supports a maximum of 5 descriptors chained
622                 uint16_t required_mbuf_size = RTE_ALIGN(max_frame_size / 5, 128) + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM;
623                 if (required_mbuf_size > targ->mbuf_size) {
624                         targ->mbuf_size = required_mbuf_size;
625                         plog_info("\t\tSetting mbuf_size to %u to support frame_size %u\n", targ->mbuf_size, max_frame_size);
626                 }
627         }
628         if (min_buffer_size > targ->mbuf_size) {
629                 plog_warn("Mbuf size might be too small. This might result in packet segmentation and memory leak\n");
630         }
631
632 }
633
634 static void setup_mempools_unique_per_socket(void)
635 {
636         uint32_t flags = 0;
637         char name[64];
638         struct lcore_cfg *lconf = NULL;
639         struct task_args *targ;
640
641         struct rte_mempool     *pool[MAX_SOCKETS];
642         uint32_t mbuf_count[MAX_SOCKETS] = {0};
643         uint32_t nb_cache_mbuf[MAX_SOCKETS] = {0};
644         uint32_t mbuf_size[MAX_SOCKETS] = {0};
645
646         while (core_targ_next_early(&lconf, &targ, 0) == 0) {
647                 PROX_PANIC(targ->task_init == NULL, "task_init = NULL, is mode specified for core %d, task %d ?\n", lconf->id, targ->id);
648                 uint8_t socket = rte_lcore_to_socket_id(lconf->id);
649                 PROX_ASSERT(socket < MAX_SOCKETS);
650
651                 set_mbuf_size(targ);
652                 if (targ->rx_port_queue[0].port != OUT_DISCARD) {
653                         struct prox_port_cfg* port_cfg = &prox_port_cfg[targ->rx_port_queue[0].port];
654                         PROX_ASSERT(targ->nb_mbuf != 0);
655                         mbuf_count[socket] += targ->nb_mbuf;
656                         if (nb_cache_mbuf[socket] == 0)
657                                 nb_cache_mbuf[socket] = targ->nb_cache_mbuf;
658                         else {
659                                 PROX_PANIC(nb_cache_mbuf[socket] != targ->nb_cache_mbuf,
660                                            "all mbuf_cache must have the same size if using a unique mempool per socket\n");
661                         }
662                         if (mbuf_size[socket] == 0)
663                                 mbuf_size[socket] = targ->mbuf_size;
664                         else {
665                                 PROX_PANIC(mbuf_size[socket] != targ->mbuf_size,
666                                            "all mbuf_size must have the same size if using a unique mempool per socket\n");
667                         }
668                 }
669         }
670         for (int i = 0 ; i < MAX_SOCKETS; i++) {
671                 if (mbuf_count[i] != 0) {
672                         sprintf(name, "socket_%u_pool", i);
673                         pool[i] = rte_mempool_create(name,
674                                                      mbuf_count[i] - 1, mbuf_size[i],
675                                                      nb_cache_mbuf[i],
676                                                      sizeof(struct rte_pktmbuf_pool_private),
677                                                      rte_pktmbuf_pool_init, NULL,
678                                                      prox_pktmbuf_init, NULL,
679                                                      i, flags);
680                         PROX_PANIC(pool[i] == NULL, "\t\tError: cannot create mempool for socket %u\n", i);
681                         plog_info("\t\tMempool %p size = %u * %u cache %u, socket %d\n", pool[i],
682                                   mbuf_count[i], mbuf_size[i], nb_cache_mbuf[i], i);
683
684                         if (prox_cfg.flags & DSF_SHUFFLE) {
685                                 shuffle_mempool(pool[i], mbuf_count[i]);
686                         }
687                 }
688         }
689
690         lconf = NULL;
691         while (core_targ_next_early(&lconf, &targ, 0) == 0) {
692                 uint8_t socket = rte_lcore_to_socket_id(lconf->id);
693
694                 if (targ->rx_port_queue[0].port != OUT_DISCARD) {
695                         /* use this pool for the interface that the core is receiving from */
696                         /* If one core receives from multiple ports, all the ports use the same mempool */
697                         targ->pool = pool[socket];
698                         /* Set the number of mbuf to the number of the unique mempool, so that the used and free work */
699                         targ->nb_mbuf = mbuf_count[socket];
700                         plog_info("\t\tMempool %p size = %u * %u cache %u, socket %d\n", targ->pool,
701                                   targ->nb_mbuf, mbuf_size[socket], targ->nb_cache_mbuf, socket);
702                 }
703         }
704 }
705
706 static void setup_mempool_for_rx_task(struct lcore_cfg *lconf, struct task_args *targ)
707 {
708         const uint8_t socket = rte_lcore_to_socket_id(lconf->id);
709         struct prox_port_cfg *port_cfg = &prox_port_cfg[targ->rx_port_queue[0].port];
710         const struct rte_memzone *mz;
711         struct rte_mempool *mp = NULL;
712         uint32_t flags = 0;
713         char memzone_name[64];
714         char name[64];
715
716         set_mbuf_size(targ);
717
718         /* allocate memory pool for packets */
719         PROX_ASSERT(targ->nb_mbuf != 0);
720
721         if (targ->pool_name[0] == '\0') {
722                 sprintf(name, "core_%u_port_%u_pool", lconf->id, targ->id);
723         }
724
725         snprintf(memzone_name, sizeof(memzone_name)-1, "MP_%s", targ->pool_name);
726         mz = rte_memzone_lookup(memzone_name);
727
728         if (mz != NULL) {
729                 mp = (struct rte_mempool*)mz->addr;
730
731                 targ->nb_mbuf = mp->size;
732                 targ->pool = mp;
733         }
734
735 #ifdef RTE_LIBRTE_IVSHMEM_FALSE
736         if (mz != NULL && mp != NULL && mp->phys_addr != mz->ioremap_addr) {
737                 /* Init mbufs with ioremap_addr for dma */
738                 mp->phys_addr = mz->ioremap_addr;
739                 mp->elt_pa[0] = mp->phys_addr + (mp->elt_va_start - (uintptr_t)mp);
740
741                 struct prox_pktmbuf_reinit_args init_args;
742                 init_args.mp = mp;
743                 init_args.lconf = lconf;
744
745                 uint32_t elt_sz = mp->elt_size + mp->header_size + mp->trailer_size;
746                 rte_mempool_obj_iter((void*)mp->elt_va_start, mp->size, elt_sz, 1,
747                                      mp->elt_pa, mp->pg_num, mp->pg_shift, prox_pktmbuf_reinit, &init_args);
748         }
749 #endif
750
751         /* Use this pool for the interface that the core is
752            receiving from if one core receives from multiple
753            ports, all the ports use the same mempool */
754         if (targ->pool == NULL) {
755                 plog_info("\t\tCreating mempool with name '%s'\n", name);
756                 targ->pool = rte_mempool_create(name,
757                                                 targ->nb_mbuf - 1, targ->mbuf_size,
758                                                 targ->nb_cache_mbuf,
759                                                 sizeof(struct rte_pktmbuf_pool_private),
760                                                 rte_pktmbuf_pool_init, NULL,
761                                                 prox_pktmbuf_init, lconf,
762                                                 socket, flags);
763         }
764
765         PROX_PANIC(targ->pool == NULL,
766                    "\t\tError: cannot create mempool for core %u port %u: %s\n", lconf->id, targ->id, rte_strerror(rte_errno));
767
768         plog_info("\t\tMempool %p size = %u * %u cache %u, socket %d\n", targ->pool,
769                   targ->nb_mbuf, targ->mbuf_size, targ->nb_cache_mbuf, socket);
770         if (prox_cfg.flags & DSF_SHUFFLE) {
771                 shuffle_mempool(targ->pool, targ->nb_mbuf);
772         }
773 }
774
775 static void setup_mempools_multiple_per_socket(void)
776 {
777         struct lcore_cfg *lconf = NULL;
778         struct task_args *targ;
779
780         while (core_targ_next_early(&lconf, &targ, 0) == 0) {
781                 PROX_PANIC(targ->task_init == NULL, "task_init = NULL, is mode specified for core %d, task %d ?\n", lconf->id, targ->id);
782                 if (targ->rx_port_queue[0].port == OUT_DISCARD)
783                         continue;
784                 setup_mempool_for_rx_task(lconf, targ);
785         }
786 }
787
788 static void setup_mempools(void)
789 {
790         if (prox_cfg.flags & UNIQUE_MEMPOOL_PER_SOCKET)
791                 setup_mempools_unique_per_socket();
792         else
793                 setup_mempools_multiple_per_socket();
794 }
795
796 static void set_task_lconf(void)
797 {
798         struct lcore_cfg *lconf;
799         uint32_t lcore_id = -1;
800
801         while(prox_core_next(&lcore_id, 1) == 0) {
802                 lconf = &lcore_cfg[lcore_id];
803                 for (uint8_t task_id = 0; task_id < lconf->n_tasks_all; ++task_id) {
804                         lconf->targs[task_id].lconf = lconf;
805                 }
806         }
807 }
808
809 static void set_dest_threads(void)
810 {
811         struct lcore_cfg *lconf = NULL;
812         struct task_args *targ;
813
814         while (core_targ_next(&lconf, &targ, 0) == 0) {
815                 for (uint8_t idx = 0; idx < MAX_PROTOCOLS; ++idx) {
816                         for (uint8_t ring_idx = 0; ring_idx < targ->core_task_set[idx].n_elems; ++ring_idx) {
817                                 struct core_task ct = targ->core_task_set[idx].core_task[ring_idx];
818
819                                 struct task_args *dest_task = core_targ_get(ct.core, ct.task);
820                                 dest_task->prev_tasks[dest_task->n_prev_tasks++] = targ;
821                         }
822                 }
823         }
824 }
825
826 static void setup_all_task_structs_early_init(void)
827 {
828         struct lcore_cfg *lconf = NULL;
829         struct task_args *targ;
830
831         plog_info("\t*** Calling early init on all tasks ***\n");
832         while (core_targ_next(&lconf, &targ, 0) == 0) {
833                 if (targ->task_init->early_init) {
834                         targ->task_init->early_init(targ);
835                 }
836         }
837 }
838
839 static void setup_all_task_structs(void)
840 {
841         struct lcore_cfg *lconf;
842         uint32_t lcore_id = -1;
843         struct task_base *tmaster = NULL;
844
845         while(prox_core_next(&lcore_id, 1) == 0) {
846                 lconf = &lcore_cfg[lcore_id];
847                 for (uint8_t task_id = 0; task_id < lconf->n_tasks_all; ++task_id) {
848                         if (task_is_master(&lconf->targs[task_id])) {
849                                 plog_info("\tInitializing MASTER struct for core %d task %d\n", lcore_id, task_id);
850                                 lconf->tasks_all[task_id] = init_task_struct(&lconf->targs[task_id]);
851                                 tmaster = lconf->tasks_all[task_id];
852                         }
853                 }
854         }
855         PROX_PANIC(tmaster == NULL, "Can't initialize master task\n");
856         lcore_id = -1;
857
858         while(prox_core_next(&lcore_id, 1) == 0) {
859                 lconf = &lcore_cfg[lcore_id];
860                 plog_info("\tInitializing struct for core %d with %d task\n", lcore_id, lconf->n_tasks_all);
861                 for (uint8_t task_id = 0; task_id < lconf->n_tasks_all; ++task_id) {
862                         if (!task_is_master(&lconf->targs[task_id])) {
863                                 plog_info("\tInitializing struct for core %d task %d\n", lcore_id, task_id);
864                                 lconf->targs[task_id].tmaster = tmaster;
865                                 lconf->tasks_all[task_id] = init_task_struct(&lconf->targs[task_id]);
866                         }
867                 }
868         }
869 }
870
871 static void init_port_activate(void)
872 {
873         struct lcore_cfg *lconf = NULL;
874         struct task_args *targ;
875         uint8_t port_id = 0;
876
877         while (core_targ_next_early(&lconf, &targ, 0) == 0) {
878                 for (int i = 0; i < targ->nb_rxports; i++) {
879                         port_id = targ->rx_port_queue[i].port;
880                         prox_port_cfg[port_id].active = 1;
881                 }
882
883                 for (int i = 0; i < targ->nb_txports; i++) {
884                         port_id = targ->tx_port_queue[i].port;
885                         prox_port_cfg[port_id].active = 1;
886                 }
887         }
888 }
889
890 /* Initialize cores and allocate mempools */
891 static void init_lcores(void)
892 {
893         struct lcore_cfg *lconf = 0;
894         uint32_t lcore_id = -1;
895
896         while(prox_core_next(&lcore_id, 0) == 0) {
897                 uint8_t socket = rte_lcore_to_socket_id(lcore_id);
898                 PROX_PANIC(socket + 1 > MAX_SOCKETS, "Can't configure core %u (on socket %u). MAX_SOCKET is set to %d\n", lcore_id, socket, MAX_SOCKETS);
899         }
900
901         /* need to allocate mempools as the first thing to use the lowest possible address range */
902         plog_info("=== Initializing mempools ===\n");
903         setup_mempools();
904
905         lcore_cfg_alloc_hp();
906
907         set_dest_threads();
908         set_task_lconf();
909
910         plog_info("=== Initializing port addresses ===\n");
911         init_port_addr();
912
913         plog_info("=== Initializing queue numbers on cores ===\n");
914         configure_if_queues();
915
916         configure_multi_segments();
917
918         plog_info("=== Initializing rings on cores ===\n");
919         init_rings();
920
921         plog_info("=== Checking configuration consistency ===\n");
922         check_cfg_consistent();
923
924         plog_all_rings();
925
926         setup_all_task_structs_early_init();
927         plog_info("=== Initializing tasks ===\n");
928         setup_all_task_structs();
929 }
930
931 static int setup_prox(int argc, char **argv)
932 {
933         if (prox_read_config_file() != 0 ||
934             prox_setup_rte(argv[0]) != 0) {
935                 return -1;
936         }
937
938         if (prox_cfg.flags & DSF_CHECK_SYNTAX) {
939                 plog_info("=== Configuration file syntax has been checked ===\n\n");
940                 exit(EXIT_SUCCESS);
941         }
942
943         init_port_activate();
944         plog_info("=== Initializing rte devices ===\n");
945         if (!(prox_cfg.flags & DSF_USE_DUMMY_DEVICES))
946                 init_rte_ring_dev();
947         init_rte_dev(prox_cfg.flags & DSF_USE_DUMMY_DEVICES);
948         plog_info("=== Calibrating TSC overhead ===\n");
949         clock_init();
950         plog_info("\tTSC running at %"PRIu64" Hz\n", rte_get_tsc_hz());
951
952         init_lcores();
953         plog_info("=== Initializing ports ===\n");
954         init_port_all();
955
956         if (prox_cfg.logbuf_size) {
957                 prox_cfg.logbuf = prox_zmalloc(prox_cfg.logbuf_size, rte_socket_id());
958                 PROX_PANIC(prox_cfg.logbuf == NULL, "Failed to allocate memory for logbuf with size = %d\n", prox_cfg.logbuf_size);
959         }
960
961         if (prox_cfg.flags & DSF_CHECK_INIT) {
962                 plog_info("=== Initialization sequence completed ===\n\n");
963                 exit(EXIT_SUCCESS);
964         }
965
966         /* Current way that works to disable DPDK logging */
967         FILE *f = fopen("/dev/null", "r");
968         rte_openlog_stream(f);
969         plog_info("=== PROX started ===\n");
970         return 0;
971 }
972
/*
 * Result reported by the daemon child to the waiting parent: 1 after
 * SIGUSR1, 0 after any other signal routed to siguser_handler().
 * Written from a signal handler and read from main(), hence
 * volatile sig_atomic_t.
 */
static volatile sig_atomic_t success = 0;

/*
 * Signal handler recording whether the received signal was SIGUSR1
 * (setup succeeded) or something else (setup failed).
 */
static void siguser_handler(int signal)
{
        if (signal == SIGUSR1)
                success = 1;
        else
                success = 0;
}
981
982 static void sigabrt_handler(__attribute__((unused)) int signum)
983 {
984         /* restore default disposition for SIGABRT and SIGPIPE */
985         signal(SIGABRT, SIG_DFL);
986         signal(SIGPIPE, SIG_DFL);
987
988         /* ignore further Ctrl-C */
989         signal(SIGINT, SIG_IGN);
990
991         /* more drastic exit on tedious termination signal */
992         plog_info("Aborting...\n");
993         if (lcore_cfg != NULL) {
994                 uint32_t lcore_id;
995                 pthread_t thread_id, tid0, tid = pthread_self();
996                 memset(&tid0, 0, sizeof(tid0));
997
998                 /* cancel all threads except current one */
999                 lcore_id = -1;
1000                 while (prox_core_next(&lcore_id, 1) == 0) {
1001                         thread_id = lcore_cfg[lcore_id].thread_id;
1002                         if (pthread_equal(thread_id, tid0))
1003                                 continue;
1004                         if (pthread_equal(thread_id, tid))
1005                                 continue;
1006                         pthread_cancel(thread_id);
1007                 }
1008
1009                 /* wait for cancelled threads to terminate */
1010                 lcore_id = -1;
1011                 while (prox_core_next(&lcore_id, 1) == 0) {
1012                         thread_id = lcore_cfg[lcore_id].thread_id;
1013                         if (pthread_equal(thread_id, tid0))
1014                                 continue;
1015                         if (pthread_equal(thread_id, tid))
1016                                 continue;
1017                         pthread_join(thread_id, NULL);
1018                 }
1019         }
1020
1021         /* close ncurses */
1022         display_end();
1023
1024         /* close ports on termination signal */
1025         close_ports_atexit();
1026
1027         /* terminate now */
1028         abort();
1029 }
1030
1031 static void sigterm_handler(int signum)
1032 {
1033         /* abort on second Ctrl-C */
1034         if (signum == SIGINT)
1035                 signal(SIGINT, sigabrt_handler);
1036
1037         /* gracefully quit on harmless termination signal */
1038         /* ports will subsequently get closed at resulting exit */
1039         quit();
1040 }
1041
1042 int main(int argc, char **argv)
1043 {
1044         /* set en_US locale to print big numbers with ',' */
1045         setlocale(LC_NUMERIC, "en_US.utf-8");
1046
1047         if (prox_parse_args(argc, argv) != 0){
1048                 prox_usage(argv[0]);
1049         }
1050
1051         plog_init(prox_cfg.log_name, prox_cfg.log_name_pid);
1052         plog_info("=== " PROGRAM_NAME " " VERSION_STR " ===\n");
1053         plog_info("\tUsing DPDK %s\n", rte_version() + sizeof(RTE_VER_PREFIX));
1054         read_rdt_info();
1055
1056         if (prox_cfg.flags & DSF_LIST_TASK_MODES) {
1057                 /* list supported task modes and exit */
1058                 tasks_list();
1059                 return EXIT_SUCCESS;
1060         }
1061
1062         /* close ports at normal exit */
1063         atexit(close_ports_atexit);
1064         /* gracefully quit on harmless termination signals */
1065         signal(SIGHUP, sigterm_handler);
1066         signal(SIGINT, sigterm_handler);
1067         signal(SIGQUIT, sigterm_handler);
1068         signal(SIGTERM, sigterm_handler);
1069         signal(SIGUSR1, sigterm_handler);
1070         signal(SIGUSR2, sigterm_handler);
1071         /* more drastic exit on tedious termination signals */
1072         signal(SIGABRT, sigabrt_handler);
1073         signal(SIGPIPE, sigabrt_handler);
1074
1075         if (prox_cfg.flags & DSF_DAEMON) {
1076                 signal(SIGUSR1, siguser_handler);
1077                 signal(SIGUSR2, siguser_handler);
1078                 plog_info("=== Running in Daemon mode ===\n");
1079                 plog_info("\tForking child and waiting for setup completion\n");
1080
1081                 pid_t ppid = getpid();
1082                 pid_t pid = fork();
1083                 if (pid < 0) {
1084                         plog_err("Failed to fork process to run in daemon mode\n");
1085                         return EXIT_FAILURE;
1086                 }
1087
1088                 if (pid == 0) {
1089                         fclose(stdin);
1090                         fclose(stdout);
1091                         fclose(stderr);
1092                         if (setsid() < 0) {
1093                                 kill(ppid, SIGUSR2);
1094                                 return EXIT_FAILURE;
1095                         }
1096                         if (setup_prox(argc, argv) != 0) {
1097                                 kill(ppid, SIGUSR2);
1098                                 return EXIT_FAILURE;
1099                         }
1100                         else {
1101                                 kill(ppid, SIGUSR1);
1102                                 run(prox_cfg.flags);
1103                                 return EXIT_SUCCESS;
1104                         }
1105                 }
1106                 else {
1107                         /* Before exiting the parent, wait until the
1108                            child process has finished setting up */
1109                         pause();
1110                         if (prox_cfg.logbuf) {
1111                                 file_print(prox_cfg.logbuf);
1112                         }
1113                         return success? EXIT_SUCCESS : EXIT_FAILURE;
1114                 }
1115         }
1116
1117         if (setup_prox(argc, argv) != 0)
1118                 return EXIT_FAILURE;
1119         run(prox_cfg.flags);
1120         return EXIT_SUCCESS;
1121 }