2 // Copyright (c) 2010-2020 Intel Corporation
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
8 // http://www.apache.org/licenses/LICENSE-2.0
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
17 #include <rte_lcore.h>
19 #include <rte_hash_crc.h>
22 #include "task_base.h"
27 #include "handle_master.h"
28 #include "prox_port_cfg.h"
29 #include "packet_utils.h"
30 #include "prox_shared.h"
32 #include "hash_entry_types.h"
33 #include "prox_compat.h"
36 #include "prox_ipv6.h"
/*
 * Locate the IPv4 header of a packet and extract its destination address.
 *
 * pkt    : start of the Ethernet frame.
 * len    : packet length, used to bound every header access.
 * ip_dst : receives the IPv4 destination address as stored in the header.
 * vlan   : receives the VLAN id of the last 802.1ad/802.1Q tag walked over.
 *
 * Callers in this file treat a non-zero return value as "no IPv4 address
 * found, send the packet unchanged".
 * NOTE(review): the ether_type dispatch, the l2_len updates and the return
 * statements are not visible in this view - confirm the exact return values.
 */
39 static inline int find_ip(struct ether_hdr_arp *pkt, uint16_t len, uint32_t *ip_dst, uint16_t *vlan)
41 prox_rte_vlan_hdr *vlan_hdr;
42 prox_rte_ether_hdr *eth_hdr = (prox_rte_ether_hdr*)pkt;
// NOTE(review): this outer 'ip' is shadowed by the declaration inside the final 'if' below.
43 prox_rte_ipv4_hdr *ip;
44 uint16_t ether_type = eth_hdr->ether_type;
45 uint16_t l2_len = sizeof(prox_rte_ether_hdr);
// Walk stacked VLAN tags for as long as another VLAN header still fits inside the packet.
49 while (((ether_type == ETYPE_8021ad) || (ether_type == ETYPE_VLAN)) && (l2_len + sizeof(prox_rte_vlan_hdr) < len)) {
50 vlan_hdr = (prox_rte_vlan_hdr *)((uint8_t *)pkt + l2_len);
52 ether_type = vlan_hdr->eth_proto;
// The mask is applied to the big-endian TCI before the byte swap; on a little-endian
// host 0xFF0F becomes 0x0FFF after swapping, i.e. only the 12-bit VLAN id is kept.
53 *vlan = rte_be_to_cpu_16(vlan_hdr->vlan_tci & 0xFF0F); // Store VLAN, or CVLAN if QinQ
59 // In case of MPLS, next hop MAC is based on MPLS, not destination IP
71 plog_warn("Unsupported packet type %x - CRC might be wrong\n", ether_type);
// Only read the IPv4 header when l2_len is non-zero and a full IPv4 header fits in the packet.
// presumably l2_len == 0 marks "no usable L3 header" - TODO confirm against the hidden switch
75 if (l2_len && (l2_len + sizeof(prox_rte_ipv4_hdr) <= len)) {
76 prox_rte_ipv4_hdr *ip = (prox_rte_ipv4_hdr *)((uint8_t *)pkt + l2_len);
77 // TODO: implement LPM => replace ip_dst by next hop IP DST
78 *ip_dst = ip->dst_addr;
/*
 * Extract the VLAN id of a packet without looking at the L3 header.
 *
 * pkt  : start of the Ethernet frame.
 * len  : packet length, used to bound the VLAN header walk.
 * vlan : receives the VLAN id of the last 802.1ad/802.1Q tag walked over.
 *
 * NOTE(review): the l2_len update inside the loop and any initialisation of
 * *vlan are not visible in this view.
 */
84 static inline void find_vlan(struct ether_hdr_arp *pkt, uint16_t len, uint16_t *vlan)
86 prox_rte_vlan_hdr *vlan_hdr;
87 prox_rte_ether_hdr *eth_hdr = (prox_rte_ether_hdr*)pkt;
88 uint16_t ether_type = eth_hdr->ether_type;
89 uint16_t l2_len = sizeof(prox_rte_ether_hdr);
// Walk stacked VLAN tags for as long as another VLAN header still fits inside the packet.
93 while (((ether_type == ETYPE_8021ad) || (ether_type == ETYPE_VLAN)) && (l2_len + sizeof(prox_rte_vlan_hdr) < len)) {
94 vlan_hdr = (prox_rte_vlan_hdr *)((uint8_t *)pkt + l2_len);
96 ether_type = vlan_hdr->eth_proto;
// Mask is applied before the byte swap; on a little-endian host this keeps the 12-bit VLAN id.
97 *vlan = rte_be_to_cpu_16(vlan_hdr->vlan_tci & 0xFF0F); // Store VLAN, or CVLAN if QinQ
/*
 * Locate the IPv6 header of a packet and extract its destination address.
 *
 * pkt    : start of the Ethernet frame.
 * len    : packet length, used to bound every header access.
 * ip_dst : receives a copy of the IPv6 destination address.
 * vlan   : receives the VLAN id of the last 802.1ad/802.1Q tag walked over.
 *
 * Returns a pointer to the source address field inside the packet when an
 * IPv6 header was found; the caller in this file treats NULL as "not an IP
 * packet".
 * NOTE(review): the switch cases and the NULL return are not visible in this
 * view - confirm.
 */
101 static inline struct ipv6_addr *find_ip6(prox_rte_ether_hdr *pkt, uint16_t len, struct ipv6_addr *ip_dst, uint16_t *vlan)
103 prox_rte_vlan_hdr *vlan_hdr;
// NOTE(review): this outer 'ip' is shadowed by the declaration inside the final 'if' below.
104 prox_rte_ipv6_hdr *ip;
105 uint16_t ether_type = pkt->ether_type;
106 uint16_t l2_len = sizeof(prox_rte_ether_hdr);
// Walk stacked VLAN tags for as long as another VLAN header still fits inside the packet.
110 while (((ether_type == ETYPE_8021ad) || (ether_type == ETYPE_VLAN)) && (l2_len + sizeof(prox_rte_vlan_hdr) < len)) {
111 vlan_hdr = (prox_rte_vlan_hdr *)((uint8_t *)pkt + l2_len);
113 ether_type = vlan_hdr->eth_proto;
// Mask is applied before the byte swap; on a little-endian host this keeps the 12-bit VLAN id.
114 *vlan = rte_be_to_cpu_16(vlan_hdr->vlan_tci & 0xFF0F); // Store VLAN, or CVLAN if QinQ
117 switch (ether_type) {
120 // In case of MPLS, next hop MAC is based on MPLS, not destination IP
132 plog_warn("Unsupported packet type %x - CRC might be wrong\n", ether_type);
// Only read the IPv6 header when l2_len is non-zero and a full IPv6 header fits in the packet.
136 if (l2_len && (l2_len + sizeof(prox_rte_ipv6_hdr) <= len)) {
137 prox_rte_ipv6_hdr *ip = (prox_rte_ipv6_hdr *)((uint8_t *)pkt + l2_len);
138 // TODO: implement LPM => replace ip_dst by next hop IP DST
139 memcpy(ip_dst, &ip->dst_addr, sizeof(struct ipv6_addr));
// The returned pointer aliases the packet buffer, so the caller can overwrite the source address in place.
140 return (struct ipv6_addr *)&ip->src_addr;
/*
 * Build and transmit one unsolicited IPv6 Neighbour Advertisement for the
 * task's local IPv6 address on its reachable port.
 *
 * tbase : task whose ARP/ND mempool, tx_ctrlplane_pkt handler and stats are used.
 * targ  : task arguments providing local_ipv6.
 *
 * If no mbuf can be obtained from the ARP/ND mempool, an error is logged and
 * nothing is sent.
 */
145 static void send_unsollicited_neighbour_advertisement(struct task_base *tbase, struct task_args *targ)
148 uint8_t out = 0, port_id = tbase->l3.reachable_port_id;
149 struct rte_mbuf *mbuf;
// Take a raw object from the ARP/ND pool; rte_mempool_get() returns 0 on success.
151 ret = rte_mempool_get(tbase->l3.arp_nd_pool, (void **)&mbuf);
152 if (likely(ret == 0)) {
153 mbuf->port = port_id;
// The advertisement uses the reachable port's MAC address and is flagged PROX_UNSOLLICITED.
154 build_neighbour_advertisement(tbase->l3.tmaster, mbuf, &prox_port_cfg[port_id].eth_addr, &targ->local_ipv6, PROX_UNSOLLICITED);
155 tbase->aux->tx_ctrlplane_pkt(tbase, &mbuf, 1, &out);
// Counted as a non-dataplane transmission.
156 TASK_STATS_ADD_TX_NON_DP(&tbase->aux->stats, 1);
158 plog_err("Failed to get a mbuf from arp/ndp mempool\n");
/*
 * Build and transmit one IPv6 Router Solicitation from the task's local IPv6
 * address on its reachable port.
 *
 * tbase : task whose ARP/ND mempool, tx_ctrlplane_pkt handler and stats are used.
 * targ  : task arguments providing local_ipv6.
 *
 * If no mbuf can be obtained from the ARP/ND mempool, an error is logged and
 * nothing is sent.
 */
162 static void send_router_sollicitation(struct task_base *tbase, struct task_args *targ)
165 uint8_t out = 0, port_id = tbase->l3.reachable_port_id;
166 struct rte_mbuf *mbuf;
// Take a raw object from the ARP/ND pool; rte_mempool_get() returns 0 on success.
168 ret = rte_mempool_get(tbase->l3.arp_nd_pool, (void **)&mbuf);
169 if (likely(ret == 0)) {
170 mbuf->port = port_id;
// The solicitation uses the reachable port's MAC address as source.
171 build_router_sollicitation(mbuf, &prox_port_cfg[port_id].eth_addr, &targ->local_ipv6);
172 tbase->aux->tx_ctrlplane_pkt(tbase, &mbuf, 1, &out);
// Counted as a non-dataplane transmission.
173 TASK_STATS_ADD_TX_NON_DP(&tbase->aux->stats, 1);
175 plog_err("Failed to get a mbuf from arp/ndp mempool\n");
179 /* This implementation could be improved: instead of checking each time we send a packet whether we need also
180 to send an ARP, we should only check whether the MAC is valid.
181 We should check arp_ndp_retransmit_timeout in the master process. This would also require the generating task to clear its arp ring
182 to avoid sending many ARP while starting after a long stop.
183 We could also check for reachable_timeout in the master so that dataplane has only to check whether MAC is available
184 but this would require either thread safety, or the exchange of information between master and generating core.
/*
 * Register a not-yet-known destination IP in the hash table and prepare an
 * ARP request for it.
 *
 * ip_hash : hash table keyed by IPv4 address.
 * ip_dst  : address to add.
 * entries : base of the table indexed by the position returned by
 *           rte_hash_add_key(); must be the start of that table.
 * nh      : next hop index recorded in the new entry.
 * time    : receives a pointer to the entry's arp_ndp_retransmit_timeout so
 *           the caller can update it.
 *
 * tsc, hz and arp_ndp_retransmit_timeout are not used by the lines visible here.
 * NOTE(review): the return statements are not visible in this view.
 */
187 static inline int add_key_and_send_arp(struct rte_hash *ip_hash, uint32_t *ip_dst, struct arp_table *entries, uint64_t tsc, uint64_t hz, uint32_t arp_ndp_retransmit_timeout, prox_next_hop_index_type nh, uint64_t **time)
// On success rte_hash_add_key() returns the key's position, which is then used as index into entries[].
189 int ret = rte_hash_add_key(ip_hash, (const void *)ip_dst);
190 if (unlikely(ret < 0)) {
191 // No reason to send ARP, as reply would be anyhow ignored
192 plogx_err("Unable to add ip "IPv4_BYTES_FMT" in mac_hash\n", IP4(*ip_dst));
195 entries[ret].ip = *ip_dst;
196 entries[ret].nh = nh;
197 *time = &entries[ret].arp_ndp_retransmit_timeout;
/*
 * Decide what to do with an outgoing mbuf given the ARP table entry of its
 * destination or next hop.
 *
 * entry : table entry holding the MAC and its two deadlines.
 * mac   : destination MAC field of the packet; filled in when the entry's MAC
 *         is still considered reachable.
 * tsc   : current timestamp.
 * time  : when an ARP/ND request is due, receives a pointer to the entry's
 *         arp_ndp_retransmit_timeout so the caller can update it.
 *
 * hz and arp_ndp_retransmit_timeout are not used by the lines visible here.
 * NOTE(review): only the SEND_MBUF_AND_ARP_ND return is visible in this view;
 * the other return values must be confirmed against the full file.
 */
202 static inline int update_mac_and_send_mbuf(struct arp_table *entry, prox_rte_ether_addr *mac, uint64_t tsc, uint64_t hz, uint32_t arp_ndp_retransmit_timeout, uint64_t **time)
// Fast path: a request was sent recently and the MAC has not expired => just use the MAC.
204 if (likely((tsc < entry->arp_ndp_retransmit_timeout) && (tsc < entry->reachable_timeout))) {
205 memcpy(mac, &entry->mac, sizeof(prox_rte_ether_addr));
// NOTE(review): tsc == arp_ndp_retransmit_timeout matches neither this test nor the one above.
207 } else if (tsc > entry->arp_ndp_retransmit_timeout) {
208 // long time since we have sent an arp, send arp
209 *time = &entry->arp_ndp_retransmit_timeout;
210 if (tsc < entry->reachable_timeout){
211 // MAC is valid in the table => send also the mbuf
212 memcpy(mac, &entry->mac, sizeof(prox_rte_ether_addr));
213 return SEND_MBUF_AND_ARP_ND;
215 // MAC still unknown, or timed out => only send ARP
219 // MAC is unknown and we already sent an ARP recently, drop mbuf and wait for ARP reply
/*
 * Resolve the destination MAC address of an outgoing IPv4 packet and write
 * it into the packet's Ethernet header.
 *
 * tbase  : task owning the L3 state (routing table, gateway, ARP tables).
 * mbuf   : packet to be sent.
 * ip_dst : receives the destination IPv4 address found in the packet.
 * vlan   : receives the VLAN id found in the packet.
 * time   : when an ARP request must be sent, receives a pointer to the
 *          retransmit deadline that the caller has to update.
 * tsc    : current timestamp.
 *
 * Three modes are handled, in this order: a configured routing table (LPM
 * lookup, next hop or local route), a single configured gateway, and plain
 * local resolution (small linear table first, hash table once it overflows).
 * The return value tells the caller whether to send the mbuf, an ARP/ND
 * request, both, or to drop the mbuf.
 * NOTE(review): the mode-selecting conditions and most return statements are
 * not visible in this view of the file.
 */
223 int write_dst_mac(struct task_base *tbase, struct rte_mbuf *mbuf, uint32_t *ip_dst, uint16_t *vlan, uint64_t **time, uint64_t tsc)
225 const uint64_t hz = rte_get_tsc_hz();
226 struct ether_hdr_arp *packet = rte_pktmbuf_mtod(mbuf, struct ether_hdr_arp *);
227 prox_rte_ether_addr *mac = &packet->ether_hdr.d_addr;
228 prox_next_hop_index_type next_hop_index;
// Function-static state used to rate-limit the "No route" message (shared by all callers).
229 static uint64_t last_tsc = 0, n_no_route = 0;
231 struct l3_base *l3 = &(tbase->l3);
233 // First find the next hop
235 // A routing table was configured
236 // If a gw (gateway_ipv4) is also specified, it is used as default gw only i.e. lowest priority (shortest prefix)
237 // This is implemented automatically through lpm
238 uint16_t len = rte_pktmbuf_pkt_len(mbuf);
239 if (find_ip(packet, len, ip_dst, vlan) != 0) {
240 // Unable to find IP address => non IP packet => send it as it
// The LPM is queried with the byte-swapped destination address.
243 if (unlikely(rte_lpm_lookup(l3->ipv4_lpm, rte_bswap32(*ip_dst), &next_hop_index) != 0)) {
244 // Prevent printing too many messages
246 if (tsc > last_tsc + rte_get_tsc_hz()) {
247 plogx_err("No route to IP "IPv4_BYTES_FMT" (%ld times)\n", IP4(*ip_dst), n_no_route);
253 struct arp_table *entry = &l3->next_hops[next_hop_index];
257 return update_mac_and_send_mbuf(entry, mac, tsc, hz, l3->arp_ndp_retransmit_timeout, time);
260 // no next ip: this is a local route
261 // Find IP in lookup table. Send ARP if not found
262 int ret = rte_hash_lookup(l3->ip_hash, (const void *)ip_dst);
263 if (unlikely(ret < 0)) {
264 // IP not found, try to send an ARP
265 return add_key_and_send_arp(l3->ip_hash, ip_dst, l3->arp_table, tsc, hz, l3->arp_ndp_retransmit_timeout, MAX_HOP_INDEX, time);
267 return update_mac_and_send_mbuf(&l3->arp_table[ret], mac, tsc, hz, l3->arp_ndp_retransmit_timeout, time);
271 // No Routing table specified: only a local ip and maybe a gateway
272 // Old default behavior: if a gw is specified, ALL packets go to this gateway (even those we could send w/o the gw
274 uint16_t len = rte_pktmbuf_pkt_len(mbuf);
276 find_vlan(packet, len, vlan);
// Gateway fast path: MAC known, request sent recently and MAC not expired.
277 if (likely((l3->flags & FLAG_DST_MAC_KNOWN) && (tsc < l3->gw.arp_ndp_retransmit_timeout) && (tsc < l3->gw.reachable_timeout))) {
278 memcpy(mac, &l3->gw.mac, sizeof(prox_rte_ether_addr));
280 } else if (tsc > l3->gw.arp_ndp_retransmit_timeout) {
281 // long time since we have successfully sent an arp, send arp
282 // If sending ARP failed (ring full) then arp_ndp_retransmit_timeout is not updated to avoid having to wait 1 sec to send ARP REQ again
283 *time = &l3->gw.arp_ndp_retransmit_timeout;
// l3->arp_ndp_retransmit_timeout is scaled by hz / 1000, i.e. it is expressed in milliseconds.
284 l3->gw.arp_ndp_retransmit_timeout = tsc + l3->arp_ndp_retransmit_timeout * hz / 1000;
287 if ((l3->flags & FLAG_DST_MAC_KNOWN) && (tsc < l3->gw.reachable_timeout)){
288 // MAC is valid in the table => send also the mbuf
289 memcpy(mac, &l3->gw.mac, sizeof(prox_rte_ether_addr));
290 return SEND_MBUF_AND_ARP_ND;
292 // MAC still unknown, or timed out => only send ARP
296 // MAC is unknown and we already sent an ARP recently, drop mbuf and wait for ARP reply
301 if (find_ip(packet, len, ip_dst, vlan) != 0) {
302 // Unable to find IP address => non IP packet => send it as it
// With fewer than 4 tracked destinations a linear scan of optimized_arp_table is used instead of the hash.
305 if (likely(l3->n_pkts < 4)) {
306 for (unsigned int idx = 0; idx < l3->n_pkts; idx++) {
307 if (*ip_dst == l3->optimized_arp_table[idx].ip) {
308 return update_mac_and_send_mbuf(&l3->optimized_arp_table[idx], mac, tsc, hz, l3->arp_ndp_retransmit_timeout, time);
311 // IP address not found in table
312 l3->optimized_arp_table[l3->n_pkts].ip = *ip_dst;
313 *time = &l3->optimized_arp_table[l3->n_pkts].arp_ndp_retransmit_timeout;
316 if (l3->n_pkts < 4) {
320 // We have too many IP addresses to search linearly; lets use hash table instead => copy all entries in hash table
321 for (uint32_t idx = 0; idx < l3->n_pkts; idx++) {
322 uint32_t ip = l3->optimized_arp_table[idx].ip;
323 int ret = rte_hash_add_key(l3->ip_hash, (const void *)&ip);
325 // This should not happen as few entries so far.
326 // If it happens, we still send the ARP as easier:
327 // If the ARP corresponds to this error, the ARP reply will be ignored
328 // If ARP does not correspond to this error/ip, then ARP reply will be handled.
329 plogx_err("Unable add ip "IPv4_BYTES_FMT" in mac_hash (already %d entries)\n", IP4(ip), idx);
331 memcpy(&l3->arp_table[ret], &l3->optimized_arp_table[idx], sizeof(struct arp_table));
336 // Find IP in lookup table. Send ARP if not found
337 int ret = rte_hash_lookup(l3->ip_hash, (const void *)ip_dst);
338 if (unlikely(ret < 0)) {
339 // IP not found, try to send an ARP
// Pass the base of arp_table: 'ret' is a negative error code here, so it must not be used as an
// index. add_key_and_send_arp() indexes the table with the position returned by rte_hash_add_key().
340 return add_key_and_send_arp(l3->ip_hash, ip_dst, l3->arp_table, tsc, hz, l3->arp_ndp_retransmit_timeout, MAX_HOP_INDEX, time);
343 return update_mac_and_send_mbuf(&l3->arp_table[ret], mac, tsc, hz, l3->arp_ndp_retransmit_timeout, time);
/*
 * Resolve the destination MAC address of an outgoing IPv6 packet, choose the
 * IPv6 source address to use, and write both into the packet.
 *
 * tbase  : task owning the L3 state (local/global IPv6, NDP tables).
 * mbuf   : packet to be sent.
 * ip_dst : receives a copy of the destination IPv6 address found in the packet.
 * vlan   : receives the VLAN id found in the packet.
 *
 * Destinations are tracked in a small linear table while fewer than 4 are
 * known, and in the ip6_hash / arp_table pair afterwards. The return value
 * tells the caller whether to send the mbuf, an NDP request, both, or to drop
 * the mbuf.
 * NOTE(review): most return statements and several else lines are not visible
 * in this view of the file.
 */
350 int write_ip6_dst_mac(struct task_base *tbase, struct rte_mbuf *mbuf, struct ipv6_addr *ip_dst, uint16_t *vlan)
352 const uint64_t hz = rte_get_tsc_hz();
353 prox_rte_ether_hdr *packet = rte_pktmbuf_mtod(mbuf, prox_rte_ether_hdr *);
354 prox_rte_ether_addr *mac = &packet->d_addr;
355 struct ipv6_addr *used_ip_src;
357 uint64_t tsc = rte_rdtsc();
358 uint16_t len = rte_pktmbuf_pkt_len(mbuf);
// find_ip6() returns a pointer to the source address field inside the packet, or NULL.
360 struct ipv6_addr *pkt_src_ip6;
361 if ((pkt_src_ip6 = find_ip6(packet, len, ip_dst, vlan)) == NULL) {
362 // Unable to find IP address => non IP packet => send it as it
365 struct l3_base *l3 = &(tbase->l3);
// Source address selection: compare the first 8 bytes of the local address with the destination.
366 if (memcmp(&l3->local_ipv6, ip_dst, 8) == 0) {
367 // Same prefix as local -> use local
368 used_ip_src = &l3->local_ipv6;
369 } else if (memcmp(&l3->global_ipv6 , &null_addr, 16) != 0) {
370 // Global IP is defined -> use it
371 used_ip_src = &l3->global_ipv6;
373 plog_info("Error as trying to send a packet to "IPv6_BYTES_FMT" using "IPv6_BYTES_FMT" (local)\n", IPv6_BYTES(ip_dst->bytes), IPv6_BYTES(l3->local_ipv6.bytes));
// Overwrite the packet's source address in place with the selected address.
377 memcpy(pkt_src_ip6, used_ip_src, sizeof(struct ipv6_addr));
// With fewer than 4 tracked destinations a linear scan of optimized_arp_table is used instead of the hash.
378 if (likely(l3->n_pkts < 4)) {
379 for (unsigned int idx = 0; idx < l3->n_pkts; idx++) {
380 if (memcmp(ip_dst, &l3->optimized_arp_table[idx].ip6, sizeof(struct ipv6_addr)) == 0) {
381 // IP address already in table
382 if ((tsc < l3->optimized_arp_table[idx].arp_ndp_retransmit_timeout) && (tsc < l3->optimized_arp_table[idx].reachable_timeout)) {
383 // MAC address was recently updated in table, use it
384 // plog_dbg("Valid MAC address found => send packet\n");
385 memcpy(mac, &l3->optimized_arp_table[idx].mac, sizeof(prox_rte_ether_addr));
387 } else if (tsc > l3->optimized_arp_table[idx].arp_ndp_retransmit_timeout) {
388 // NDP not sent since a long time, send NDP
// Unlike the IPv4 path, the retransmit deadline is re-armed here rather than by the caller.
389 l3->optimized_arp_table[idx].arp_ndp_retransmit_timeout = tsc + l3->arp_ndp_retransmit_timeout * hz / 1000;
390 if (tsc < l3->optimized_arp_table[idx].reachable_timeout) {
391 // MAC still valid => also send mbuf
392 plog_dbg("Valid MAC found but NDP retransmit timeout => send packet and NDP\n");
393 memcpy(mac, &l3->optimized_arp_table[idx].mac, sizeof(prox_rte_ether_addr));
394 return SEND_MBUF_AND_ARP_ND;
396 plog_dbg("Unknown MAC => send NDP but cannot send packet\n");
397 // MAC invalid => only send NDP
401 // NDP timeout elapsed, MAC not valid anymore but waiting for NDP reply
402 // plog_dbg("NDP reachable timeout elapsed - waiting for NDP reply\n");
407 // IP address not found in table
408 memcpy(&l3->optimized_arp_table[l3->n_pkts].ip6, ip_dst, sizeof(struct ipv6_addr));
409 l3->optimized_arp_table[l3->n_pkts].arp_ndp_retransmit_timeout = tsc + l3->arp_ndp_retransmit_timeout * hz / 1000;
412 if (l3->n_pkts < 4) {
416 // We have too many IP addresses to search linearly; lets use hash table instead => copy all entries in hash table
417 for (uint32_t idx = 0; idx < l3->n_pkts; idx++) {
418 struct ipv6_addr *ip6 = &l3->optimized_arp_table[idx].ip6;
419 int ret = rte_hash_add_key(l3->ip6_hash, (const void *)ip6);
421 // This should not happen as few entries so far.
422 // If it happens, we still send the NDP as easier:
423 // If the NDP corresponds to this error, the NDP reply will be ignored
424 // If NDP does not correspond to this error/ip, then NDP reply will be handled.
425 plogx_err("Unable add ip "IPv6_BYTES_FMT" in mac_hash (already %d entries)\n", IPv6_BYTES(ip6->bytes), idx);
427 memcpy(&l3->arp_table[ret], &l3->optimized_arp_table[idx], sizeof(struct arp_table));
432 // Find IP in lookup table. Send ND if not found
433 int ret = rte_hash_lookup(l3->ip6_hash, (const void *)ip_dst);
434 if (unlikely(ret < 0)) {
435 // IP not found, try to send an ND
// NOTE(review): this inner 'ret' shadows the lookup result declared just above.
436 int ret = rte_hash_add_key(l3->ip6_hash, (const void *)ip_dst);
438 // No reason to send NDP, as reply would be anyhow ignored
439 plogx_err("Unable to add ip "IPv6_BYTES_FMT" in mac_hash\n", IPv6_BYTES(ip_dst->bytes));
442 memcpy(&l3->arp_table[ret].ip6, ip_dst, sizeof(struct ipv6_addr));
443 l3->arp_table[ret].arp_ndp_retransmit_timeout = tsc + l3->arp_ndp_retransmit_timeout * hz / 1000;
448 if (likely((tsc < l3->arp_table[ret].arp_ndp_retransmit_timeout) && (tsc < l3->arp_table[ret].reachable_timeout))) {
449 // MAC still valid and NDP sent recently
450 memcpy(mac, &l3->arp_table[ret].mac, sizeof(prox_rte_ether_addr));
452 } else if (tsc > l3->arp_table[ret].arp_ndp_retransmit_timeout) {
453 // NDP not sent since a long time, send NDP
454 l3->arp_table[ret].arp_ndp_retransmit_timeout = tsc + l3->arp_ndp_retransmit_timeout * hz / 1000;
455 if (tsc < l3->arp_table[ret].reachable_timeout) {
456 // MAC still valid => send also MBUF
457 memcpy(mac, &l3->arp_table[ret].mac, sizeof(prox_rte_ether_addr));
458 return SEND_MBUF_AND_ARP_ND;
/*
 * One-time L3 initialisation of a task: create the IPv4 and/or IPv6 lookup
 * hashes, allocate the shared ARP/NDP table, hook the control-plane packet
 * handler into the lcore configuration and copy the L3 timeouts from the
 * task arguments (falling back to the defaults when they are 0).
 *
 * tbase : task being initialised; its l3 sub-structure is filled in.
 * targ  : task arguments (flags, gateway, timeouts, core/task ids).
 *
 * Panics (PROX_PANIC) if a hash table or the ARP/NDP table cannot be created.
 * NOTE(review): a few short lines (e.g. remaining hash_params fields and the
 * else keywords) are not visible in this view of the file.
 */
471 void task_init_l3(struct task_base *tbase, struct task_args *targ)
473 static char hash_name[30];
474 uint32_t n_entries = MAX_ARP_ENTRIES * 4;
475 const int socket_id = rte_lcore_to_socket_id(targ->lconf->id);
// Bounded write: the name can never overrun hash_name, whatever the core/task ids are.
476 snprintf(hash_name, sizeof(hash_name), "A%03d_%03d_mac_table", targ->lconf->id, targ->id);
480 struct rte_hash_parameters hash_params = {
482 .entries = n_entries,
483 .key_len = sizeof(uint32_t),
484 .hash_func = rte_hash_crc,
485 .hash_func_init_val = 0,
487 if (targ->flags & TASK_ARG_L3) {
488 plog_info("\t\tInitializing L3 (IPv4)\n");
489 tbase->l3.ip_hash = rte_hash_create(&hash_params);
490 PROX_PANIC(tbase->l3.ip_hash == NULL, "Failed to set up ip hash table\n");
494 if (targ->flags & TASK_ARG_NDP) {
495 plog_info("\t\tInitializing NDP (IPv6)\n");
// Same parameters as the IPv4 hash, only the key length changes.
496 hash_params.key_len = sizeof(struct ipv6_addr);
497 tbase->l3.ip6_hash = rte_hash_create(&hash_params);
498 PROX_PANIC(tbase->l3.ip6_hash == NULL, "Failed to set up ip hash table\n");
// One table of n_entries entries, indexed by the positions returned by the hash tables.
500 tbase->l3.arp_table = (struct arp_table *)prox_zmalloc(n_entries * sizeof(struct arp_table), socket_id);
501 PROX_PANIC(tbase->l3.arp_table == NULL, "Failed to allocate memory for %u entries in arp/ndp table\n", n_entries);
// Report the size of one table entry (struct arp_table), matching the allocation above.
502 plog_info("\t\tarp/ndp table, with %u entries of size %zu\n", n_entries, sizeof(struct arp_table));
504 targ->lconf->ctrl_func_p[targ->task] = handle_ctrl_plane_pkts;
505 targ->lconf->ctrl_timeout = freq_to_tsc(targ->ctrl_freq);
506 tbase->l3.gw.ip = rte_cpu_to_be_32(targ->gateway_ipv4);
507 tbase->flags |= TASK_L3;
508 tbase->l3.core_id = targ->lconf->id;
509 tbase->l3.task_id = targ->id;
510 tbase->l3.tmaster = targ->tmaster;
511 tbase->l3.seed = (uint)rte_rdtsc();
// A value of 0 in the task arguments selects the compiled-in default.
512 if (targ->reachable_timeout != 0)
513 tbase->l3.reachable_timeout = targ->reachable_timeout;
515 tbase->l3.reachable_timeout = DEFAULT_ARP_TIMEOUT;
516 if (targ->arp_ndp_retransmit_timeout != 0)
517 tbase->l3.arp_ndp_retransmit_timeout = targ->arp_ndp_retransmit_timeout;
519 tbase->l3.arp_ndp_retransmit_timeout = DEFAULT_ARP_UPDATE_TIME;
/*
 * Start-time L3 setup of a task. When a reachable port exists and the ARP/ND
 * mempool has not been created yet, this:
 *  - reconciles local_ipv4 / prefix between the core and port configuration
 *    (panics when both are set and differ) and registers the IP with the
 *    control plane;
 *  - when a route table name is configured, loads it into an LPM, allocates
 *    the next hop array and adds the local network and the default gateway;
 *  - for NDP, derives a link-local IPv6 address if none was configured and
 *    registers the node or router with the control plane;
 *  - creates the ARP/ND mbuf pool and optionally sends a Router Solicitation
 *    and an unsolicited Neighbour Advertisement.
 *
 * tbase : task being started.
 * targ  : task arguments; local_ipv4, local_prefix and local_ipv6 may be
 *         filled in from the port configuration.
 *
 * NOTE(review): several short lines (declarations of 'ret' and 'lpm', loop
 * control statements, else keywords, the last mempool argument) are not
 * visible in this view of the file.
 */
522 void task_start_l3(struct task_base *tbase, struct task_args *targ)
524 const int socket_id = rte_lcore_to_socket_id(targ->lconf->id);
525 const int NB_ARP_ND_MBUF = 1024;
526 const int ARP_ND_MBUF_SIZE = 2048;
527 const int NB_CACHE_ARP_ND_MBUF = 256;
529 struct prox_port_cfg *port = find_reachable_port(targ);
// Everything below runs once per task: it is skipped as soon as arp_nd_pool exists.
530 if (port && (tbase->l3.arp_nd_pool == NULL)) {
// Static buffer holding the mempool name passed to rte_mempool_create() below.
531 static char name[] = "arp0_pool";
532 tbase->l3.reachable_port_id = port - prox_port_cfg;
533 if ((targ->local_ipv4 && port->ip) && (targ->local_ipv4 != port->ip)) {
534 PROX_PANIC(1, "local_ipv4 in core section ("IPv4_BYTES_FMT") differs from port section ("IPv4_BYTES_FMT")\n", IP4(rte_be_to_cpu_32(targ->local_ipv4)), IP4(rte_be_to_cpu_32(port->ip)));
536 if ((targ->local_ipv4 && port->ip) && (targ->local_prefix != port->prefix)) {
537 PROX_PANIC(1, "local_ipv4 prefix in core section (%d) differs from port section (%d)\n", targ->local_prefix, port->prefix);
// No core-level address configured: inherit address and prefix from the port.
539 if (!targ->local_ipv4) {
540 targ->local_ipv4 = port->ip;
541 targ->local_prefix = port->prefix;
542 plog_info("Setting core local_ipv4 from port %d local_ipv4 to "IPv4_BYTES_FMT"\n", tbase->l3.reachable_port_id, IP4(rte_be_to_cpu_32(port->ip)));
544 if (targ->local_ipv4) {
545 tbase->l3.local_ipv4 = rte_be_to_cpu_32(targ->local_ipv4);
546 register_ip_to_ctrl_plane(tbase->l3.tmaster, tbase->l3.local_ipv4, tbase->l3.reachable_port_id, targ->lconf->id, targ->id);
// A non-empty route_table name enables LPM-based routing.
548 if (strcmp(targ->route_table, "") != 0) {
552 PROX_PANIC(tbase->l3.local_ipv4 == 0, "missing local_ipv4 while route table is specified in L3 mode\n");
554 // LPM might be modified runtime => do not share with other cores
555 ret = lua_to_lpm4(prox_lua(), GLOBAL, targ->route_table, socket_id, &lpm);
556 PROX_PANIC(ret, "Failed to load IPv4 LPM:\n%s\n", get_lua_to_errors());
558 tbase->l3.ipv4_lpm = lpm->rte_lpm;
559 tbase->l3.next_hops = prox_zmalloc(sizeof(*tbase->l3.next_hops) * MAX_HOP_INDEX, socket_id);
560 PROX_PANIC(tbase->l3.next_hops == NULL, "Could not allocate memory for next hop\n");
// Copy the next hops defined by the route table into the task's own next hop array.
562 for (uint32_t i = 0; i < MAX_HOP_INDEX; i++) {
563 if (!lpm->next_hops[i].ip_dst)
566 tbase->l3.next_hops[i].ip = rte_bswap32(lpm->next_hops[i].ip_dst);
567 int tx_port = lpm->next_hops[i].mac_port.out_idx;
568 // gen only supports one port right now .... hence port = 0
569 if ((tx_port > targ->nb_txports - 1) && (tx_port > targ->nb_txrings - 1)) {
570 PROX_PANIC(1, "Routing Table contains port %d but only %d tx port/ %d ring:\n", tx_port, targ->nb_txports, targ->nb_txrings);
573 plog_info("Using routing table %s in l3 mode, with %d gateways\n", targ->route_table, tbase->l3.nb_gws);
575 // Last but one "next_hop_index" is not a gateway but direct routes
// An ip of 0 in a next hop entry marks the directly connected (local) route.
576 tbase->l3.next_hops[tbase->l3.nb_gws].ip = 0;
577 ret = rte_lpm_add(tbase->l3.ipv4_lpm, targ->local_ipv4, targ->local_prefix, tbase->l3.nb_gws++);
578 PROX_PANIC(ret, "Failed to add local_ipv4 "IPv4_BYTES_FMT"/%d to lpm\n", IP4(tbase->l3.local_ipv4), targ->local_prefix);
579 // Last "next_hop_index" is default gw
580 tbase->l3.next_hops[tbase->l3.nb_gws].ip = rte_bswap32(targ->gateway_ipv4);
// The gateway is installed with prefix length 0, i.e. as the default route.
581 if (targ->gateway_ipv4) {
582 ret = rte_lpm_add(tbase->l3.ipv4_lpm, targ->gateway_ipv4, 0, tbase->l3.nb_gws++);
583 PROX_PANIC(ret, "Failed to add gateway_ipv4 "IPv4_BYTES_FMT"/%d to lpm\n", IP4(tbase->l3.gw.ip), 0);
587 master_init_vdev(tbase->l3.tmaster, tbase->l3.reachable_port_id, targ->lconf->id, targ->id);
589 // Create IPv6 addr if none were configured
590 if (targ->flags & TASK_ARG_NDP) {
// An all-zero local_ipv6 means "not configured": build one from the port's MAC address.
591 if (!memcmp(&targ->local_ipv6, &null_addr, 16)) {
592 set_link_local(&targ->local_ipv6);
593 set_EUI(&targ->local_ipv6, &port->eth_addr);
595 plog_info("\tCore %d, task %d, local IPv6 addr is "IPv6_BYTES_FMT" (%s)\n",
596 targ->lconf->id, targ->id,
597 IPv6_BYTES(targ->local_ipv6.bytes),
598 IP6_Canonical(&targ->local_ipv6));
599 memcpy(&tbase->l3.local_ipv6, &targ->local_ipv6, sizeof(struct ipv6_addr));
// A global address is only copied when one was configured (non-zero).
601 if (memcmp(&targ->global_ipv6, &null_addr, sizeof(struct ipv6_addr))) {
602 memcpy(&tbase->l3.global_ipv6, &targ->global_ipv6, sizeof(struct ipv6_addr));
603 plog_info("\tCore %d, task %d, global IPv6 addr is "IPv6_BYTES_FMT" (%s)\n",
604 targ->lconf->id, targ->id,
605 IPv6_BYTES(targ->global_ipv6.bytes),
606 IP6_Canonical(&targ->global_ipv6));
608 if (targ->ipv6_router)
609 register_router_to_ctrl_plane(tbase->l3.tmaster, tbase->l3.reachable_port_id, targ->lconf->id, targ->id, &targ->local_ipv6, &targ->global_ipv6, &targ->router_prefix);
611 register_node_to_ctrl_plane(tbase->l3.tmaster, &targ->local_ipv6, &targ->global_ipv6, tbase->l3.reachable_port_id, targ->lconf->id, targ->id);
// Dedicated mbuf pool for the ARP/ND packets generated by this task.
615 struct rte_mempool *ret = rte_mempool_create(name, NB_ARP_ND_MBUF, ARP_ND_MBUF_SIZE, NB_CACHE_ARP_ND_MBUF,
616 sizeof(struct rte_pktmbuf_pool_private), rte_pktmbuf_pool_init, NULL, rte_pktmbuf_init, 0,
618 PROX_PANIC(ret == NULL, "Failed to allocate ARP/ND memory pool on socket %u with %u elements\n",
619 rte_socket_id(), NB_ARP_ND_MBUF);
620 plog_info("\tMempool %p (%s) size = %u * %u cache %u, socket %d (for ARP/ND)\n", ret, name, NB_ARP_ND_MBUF,
621 ARP_ND_MBUF_SIZE, NB_CACHE_ARP_ND_MBUF, rte_socket_id());
622 tbase->l3.arp_nd_pool = ret;
// Hosts (non-routers) solicit a router at start-up.
623 if ((targ->flags & TASK_ARG_NDP) && (!targ->ipv6_router)) {
624 plog_info("Sending Router Sollicitation\n");
625 send_router_sollicitation(tbase, targ);
627 if ((targ->flags & TASK_ARG_NDP) && (targ->flags & TASK_ARG_SEND_NA_AT_STARTUP)) {
628 plog_info("Sending unsollicited Neighbour Advertisement\n");
629 send_unsollicited_neighbour_advertisement(tbase, targ);
/*
 * Change the IPv4 gateway of a task and invalidate the cached gateway MAC.
 *
 * tbase : task whose L3 gateway is changed.
 * ip    : new gateway address, stored as-is in l3.gw.ip.
 *
 * FLAG_DST_MAC_KNOWN is kept in the L3 flags (it is set and tested through
 * l3->flags elsewhere in this file), so it has to be cleared there: the old
 * gateway's MAC must not be reused for the new gateway.
 */
635 void task_set_gateway_ip(struct task_base *tbase, uint32_t ip)
637 tbase->l3.gw.ip = ip;
638 tbase->l3.flags &= ~FLAG_DST_MAC_KNOWN;
/*
 * Set the local IPv4 address of a task.
 *
 * tbase : task whose L3 state is updated.
 * ip    : new local address, stored as-is in l3.local_ipv4.
 */
641 void task_set_local_ip(struct task_base *tbase, uint32_t ip)
643 tbase->l3.local_ipv4 = ip;
/*
 * Force a new ARP request for an IPv4 address by resetting the retransmit
 * deadline of its table entry to 0. The entry is searched in the hash-backed
 * arp_table, the gateway entry, or the small linear optimized_arp_table,
 * depending on the task's mode.
 *
 * l3 : L3 state of the task.
 * ip : address whose entry is reset.
 *
 * NOTE(review): the conditions selecting each branch, the declaration of
 * 'idx' and the loop's match test are not visible in this view of the file.
 */
646 static void reset_arp_ndp_retransmit_timeout(struct l3_base *l3, uint32_t ip)
649 plogx_dbg("MAC entry for IP "IPv4_BYTES_FMT" timeout in kernel\n", IP4(ip));
652 int ret = rte_hash_lookup(l3->ip_hash, (const void *)&ip);
// A deadline of 0 is always in the past, so the next packet triggers an ARP request.
654 l3->arp_table[ret].arp_ndp_retransmit_timeout = 0;
655 } else if (ip == l3->gw.ip) {
656 l3->gw.arp_ndp_retransmit_timeout = 0;
657 } else if (l3->n_pkts < 4) {
// Linear search of the small table; idx == n_pkts after the loop means "not found".
658 for (idx = 0; idx < l3->n_pkts; idx++) {
659 uint32_t ip_dst = l3->optimized_arp_table[idx].ip;
663 if (idx < l3->n_pkts) {
664 l3->optimized_arp_table[idx].arp_ndp_retransmit_timeout = 0;
667 int ret = rte_hash_lookup(l3->ip_hash, (const void *)&ip);
669 l3->arp_table[ret].arp_ndp_retransmit_timeout = 0;
/*
 * Find the next hop index of a gateway, registering the gateway when it is
 * not known yet and there is still room.
 *
 * tbase : task owning the next_hops array.
 * gw_ip : gateway address, compared as-is with next_hops[].ip.
 *
 * Returns MAX_HOP_INDEX when the gateway is unknown and the next_hops array
 * is full; the caller in this file treats any value >= MAX_HOP_INDEX as an
 * error.
 * NOTE(review): the return inside the search loop and the nb_gws increment
 * are not visible in this view of the file.
 */
674 static prox_next_hop_index_type get_nh_index(struct task_base *tbase, uint32_t gw_ip)
676 // Check if gateway already exists
677 for (prox_next_hop_index_type i = 0; i < tbase->l3.nb_gws; i++) {
678 if (tbase->l3.next_hops[i].ip == gw_ip) {
// Not found: append the gateway when there is room left.
682 if (tbase->l3.nb_gws < MAX_HOP_INDEX) {
683 tbase->l3.next_hops[tbase->l3.nb_gws].ip = gw_ip;
685 return tbase->l3.nb_gws - 1;
687 return MAX_HOP_INDEX;
/*
 * Handle a burst of control-plane messages sent by the master task.
 *
 * tbase  : task receiving the messages.
 * mbufs  : array of n_pkts message mbufs.
 * n_pkts : number of messages in the burst.
 *
 * Each mbuf carries a command (get_command()). The visible commands are:
 *  - ROUTE_ADD_FROM_MASTER / ROUTE_DEL_FROM_MASTER: update the IPv4 LPM;
 *  - MAC_INFO_FROM_MASTER: store a resolved IPv4 MAC (or reset the retransmit
 *    deadline when the MAC is all zero) in the matching table;
 *  - MAC_INFO_FROM_MASTER_FOR_IPV6: store a resolved IPv6 neighbour MAC;
 *  - SEND_NDP / SEND_ARP_REQUEST / SEND_ARP_REPLY / SEND_ICMP _FROM_MASTER:
 *    forward the packet through tx_ctrlplane_pkt;
 *  - packets coming from the TAP device: drop PROX pseudo packets, forward
 *    everything else;
 *  - IPV6_INFO_FROM_MASTER: record or check the global IPv6 address/prefix.
 * Any other command is logged as unexpected.
 *
 * NOTE(review): many short lines (switch, some case labels, break statements,
 * mbuf release calls, declarations of 'out', 'command' and 'nh') are not
 * visible in this view of the file.
 */
689 void handle_ctrl_plane_pkts(struct task_base *tbase, struct rte_mbuf **mbufs, uint16_t n_pkts)
692 const uint64_t hz = rte_get_tsc_hz();
693 uint32_t ip, ip_dst, idx, gateway_ip, prefix;
694 prox_next_hop_index_type gateway_index;
695 int j, ret, modified_route;
697 struct ipv6_addr *ip6, *ip6_dst;
699 prox_rte_ether_hdr *hdr;
700 struct ether_hdr_arp *hdr_arp;
701 struct l3_base *l3 = &tbase->l3;
702 uint64_t tsc= rte_rdtsc();
// l3->reachable_timeout is scaled by hz / 1000, i.e. it is expressed in milliseconds.
703 uint64_t reachable_timeout = l3->reachable_timeout * hz / 1000;
705 prox_rte_ipv4_hdr *pip;
706 prox_rte_udp_hdr *udp_hdr;
707 uint8_t port = tbase->l3.reachable_port_id;
// Prefetch passes over the burst before the processing loop.
709 for (j = 0; j < n_pkts; ++j) {
712 for (j = 0; j < n_pkts; ++j) {
713 PREFETCH0(rte_pktmbuf_mtod(mbufs[j], void *));
716 for (j = 0; j < n_pkts; ++j) {
719 out[0] = OUT_HANDLED;
720 command = get_command(mbufs[j]);
721 plogx_dbg("\tReceived %s mbuf %p\n", actions_string[command], mbufs[j]);
// Add (or replace) a route; the gateway is mapped to a next hop index first.
723 case ROUTE_ADD_FROM_MASTER:
724 ip = ctrl_ring_get_ip(mbufs[j]);
725 gateway_ip = ctrl_ring_get_gateway_ip(mbufs[j]);
726 prefix = ctrl_ring_get_prefix(mbufs[j]);
727 gateway_index = get_nh_index(tbase, gateway_ip);
728 if (gateway_index >= MAX_HOP_INDEX) {
729 plog_err("Unable to find or define gateway index - too many\n");
// Remember whether the rule already existed, only to choose the log message below.
732 modified_route = rte_lpm_is_rule_present(tbase->l3.ipv4_lpm, rte_bswap32(ip), prefix, &nh);
733 ret = rte_lpm_add(tbase->l3.ipv4_lpm, rte_bswap32(ip), prefix, gateway_index);
735 plog_err("Failed to add route to "IPv4_BYTES_FMT"/%d using "IPv4_BYTES_FMT"(index = %d)\n", IP4(ip), prefix, IP4(gateway_ip), gateway_index);
736 } else if (modified_route)
737 plogx_dbg("Modified route to "IPv4_BYTES_FMT"/%d using "IPv4_BYTES_FMT"(index = %d) (was using "IPv4_BYTES_FMT"(index = %d)\n", IP4(ip), prefix, IP4(gateway_ip), gateway_index, IP4(tbase->l3.next_hops[nh].ip), nh);
739 plogx_dbg("Added new route to "IPv4_BYTES_FMT"/%d using "IPv4_BYTES_FMT"(index = %d)\n", IP4(ip), prefix, IP4(gateway_ip), gateway_index);
// Delete a route, only attempted when the rule is present in the LPM.
743 case ROUTE_DEL_FROM_MASTER:
744 ip = ctrl_ring_get_ip(mbufs[j]);
745 prefix = ctrl_ring_get_prefix(mbufs[j]);
747 ret = rte_lpm_is_rule_present(tbase->l3.ipv4_lpm, rte_bswap32(ip), prefix, &nh);
749 ret = rte_lpm_delete(tbase->l3.ipv4_lpm, rte_bswap32(ip), prefix);
// This message follows rte_lpm_delete(), so it reports a failed deletion.
751 plog_err("Failed to delete rule\n");
753 plog_info("Deleting route to "IPv4_BYTES_FMT"/%d\n", IP4(ip), prefix);
// Resolved IPv4 MAC address (ARP reply relayed by the master).
757 case MAC_INFO_FROM_MASTER:
758 hdr_arp = rte_pktmbuf_mtod(mbufs[j], struct ether_hdr_arp *);
759 ip = get_ip(mbufs[j]);
761 if (prox_rte_is_zero_ether_addr(&hdr_arp->arp.data.sha)) {
762 // MAC timeout or deleted from kernel table => reset update_time
763 // This will cause us to send new ARP request
764 // However, as reachable_timeout not touched, we should continue sending our regular IP packets
765 reset_arp_ndp_retransmit_timeout(l3, ip);
768 plogx_dbg("\tUpdating MAC entry for IP "IPv4_BYTES_FMT" with MAC "MAC_BYTES_FMT"\n",
769 IP4(ip), MAC_BYTES(hdr_arp->arp.data.sha.addr_bytes));
773 struct arp_table *entry;
774 ret = rte_hash_add_key(l3->ip_hash, (const void *)&ip);
776 plogx_info("Unable add ip "IPv4_BYTES_FMT" in mac_hash\n", IP4(ip));
// An entry whose nh is a real next hop index: the MAC belongs to that next hop entry.
777 } else if ((nh = l3->arp_table[ret].nh) != MAX_HOP_INDEX) {
778 entry = &l3->next_hops[nh];
779 memcpy(&entry->mac, &(hdr_arp->arp.data.sha), sizeof(prox_rte_ether_addr));
780 entry->reachable_timeout = tsc + reachable_timeout;
781 update_arp_ndp_retransmit_timeout(l3, &entry->arp_ndp_retransmit_timeout, l3->arp_ndp_retransmit_timeout);
783 memcpy(&l3->arp_table[ret].mac, &(hdr_arp->arp.data.sha), sizeof(prox_rte_ether_addr));
784 l3->arp_table[ret].reachable_timeout = tsc + reachable_timeout;
785 update_arp_ndp_retransmit_timeout(l3, &l3->arp_table[ret].arp_ndp_retransmit_timeout, l3->arp_ndp_retransmit_timeout);
788 else if (ip == l3->gw.ip) {
789 // MAC address of the gateway
790 memcpy(&l3->gw.mac, &hdr_arp->arp.data.sha, 6);
791 l3->flags |= FLAG_DST_MAC_KNOWN;
792 l3->gw.reachable_timeout = tsc + reachable_timeout;
793 update_arp_ndp_retransmit_timeout(l3, &l3->gw.arp_ndp_retransmit_timeout, l3->arp_ndp_retransmit_timeout);
794 } else if (l3->n_pkts < 4) {
795 // Few packets tracked - should be faster to loop through them than using a hash table
796 for (idx = 0; idx < l3->n_pkts; idx++) {
797 ip_dst = l3->optimized_arp_table[idx].ip;
// idx < n_pkts after the loop means the address was found in the small table.
801 if (idx < l3->n_pkts) {
802 memcpy(&l3->optimized_arp_table[idx].mac, &(hdr_arp->arp.data.sha), sizeof(prox_rte_ether_addr));
803 l3->optimized_arp_table[idx].reachable_timeout = tsc + reachable_timeout;
804 update_arp_ndp_retransmit_timeout(l3, &l3->optimized_arp_table[idx].arp_ndp_retransmit_timeout, l3->arp_ndp_retransmit_timeout);
807 ret = rte_hash_add_key(l3->ip_hash, (const void *)&ip);
809 plogx_info("Unable add ip "IPv4_BYTES_FMT" in mac_hash\n", IP4(ip));
811 memcpy(&l3->arp_table[ret].mac, &(hdr_arp->arp.data.sha), sizeof(prox_rte_ether_addr));
812 l3->arp_table[ret].reachable_timeout = tsc + reachable_timeout;
813 update_arp_ndp_retransmit_timeout(l3, &l3->arp_table[ret].arp_ndp_retransmit_timeout, l3->arp_ndp_retransmit_timeout);
// Resolved IPv6 neighbour MAC address; the MAC bytes are carried in the message's data word.
818 case MAC_INFO_FROM_MASTER_FOR_IPV6:
819 ip6 = ctrl_ring_get_ipv6_addr(mbufs[j]);
820 uint64_t data = ctrl_ring_get_data(mbufs[j]);
822 if (l3->n_pkts < 4) {
823 // Few packets tracked - should be faster to loop through them than using a hash table
824 for (idx = 0; idx < l3->n_pkts; idx++) {
825 ip6_dst = &l3->optimized_arp_table[idx].ip6;
826 if (memcmp(ip6_dst, ip6, sizeof(struct ipv6_addr)) == 0)
829 if (idx < l3->n_pkts) {
830 // IP found; this is a reply for one of our requests!
831 memcpy(&l3->optimized_arp_table[idx].mac, &data, sizeof(prox_rte_ether_addr));
832 l3->optimized_arp_table[idx].reachable_timeout = tsc + l3->reachable_timeout * hz / 1000;
835 int ret = rte_hash_add_key(l3->ip6_hash, (const void *)ip6);
837 plogx_info("Unable add ip "IPv6_BYTES_FMT" in mac_hash\n", IPv6_BYTES(ip6->bytes));
839 memcpy(&l3->arp_table[ret].mac, &data, sizeof(prox_rte_ether_addr));
840 l3->arp_table[ret].reachable_timeout = tsc + l3->reachable_timeout * hz / 1000;
// Packets built by the master that only need to be transmitted by this task.
845 case SEND_NDP_FROM_MASTER:
846 case SEND_ARP_REQUEST_FROM_MASTER:
847 case SEND_ARP_REPLY_FROM_MASTER:
849 // tx_ctrlplane_pkt does not drop packets
850 plogx_dbg("\tForwarding (ARP) packet from master\n");
851 tbase->aux->tx_ctrlplane_pkt(tbase, &mbufs[j], 1, out);
852 TASK_STATS_ADD_TX_NON_DP(&tbase->aux->stats, 1);
854 case SEND_ICMP_FROM_MASTER:
856 // tx_ctrlplane_pkt does not drop packets
857 plogx_dbg("\tForwarding (PING) packet from master\n");
858 tbase->aux->tx_ctrlplane_pkt(tbase, &mbufs[j], 1, out);
859 TASK_STATS_ADD_TX_NON_DP(&tbase->aux->stats, 1);
862 // Drop Pseudo packets sent to generate ARP requests
863 // There are other IPv4 packets sent from TAP which we cannot delete e.g. BGP packets
// Locate the IPv4 header, directly after the Ethernet header or after a single VLAN tag.
865 hdr = rte_pktmbuf_mtod(mbufs[j], prox_rte_ether_hdr *);
866 if (hdr->ether_type == ETYPE_IPv4) {
867 pip = (prox_rte_ipv4_hdr *)(hdr + 1);
868 } else if (hdr->ether_type == ETYPE_VLAN) {
869 prox_rte_vlan_hdr *vlan = (prox_rte_vlan_hdr *)(hdr + 1);
870 vlan = (prox_rte_vlan_hdr *)(hdr + 1);
871 if (vlan->eth_proto == ETYPE_IPv4) {
872 pip = (prox_rte_ipv4_hdr *)(vlan + 1);
// NOTE(review): the UDP header is taken right after a fixed-size IPv4 header (no IP options handling).
875 if (pip && (pip->next_proto_id == IPPROTO_UDP)) {
876 udp_hdr = (prox_rte_udp_hdr *)(pip + 1);
// A pseudo packet is a UDP datagram of length 8 with both ports equal to PROX_PSEUDO_PKT_PORT.
877 if ((udp_hdr->dst_port == rte_cpu_to_be_16(PROX_PSEUDO_PKT_PORT)) &&
878 (udp_hdr->src_port == rte_cpu_to_be_16(PROX_PSEUDO_PKT_PORT)) &&
879 (rte_be_to_cpu_16(udp_hdr->dgram_len) == 8)) {
880 plogx_dbg("Dropping PROX packet\n");
886 uint16_t src_port = 0, dst_port = 0, len = 0;
888 src_port = udp_hdr->src_port;
889 dst_port = udp_hdr->dst_port;
890 len = rte_be_to_cpu_16(udp_hdr->dgram_len);
// Leading tab restored so this message lines up with the other "\tForwarding ..." messages.
892 plogx_dbg("\tForwarding TAP packet from master. Type = %x, pip=%p, udp = %p, udp = {src = %x, dst = %x, len = %d}\n", hdr->ether_type, pip, udp_hdr, src_port, dst_port,len );
894 // tx_ctrlplane_pkt does not drop packets
895 tbase->aux->tx_ctrlplane_pkt(tbase, &mbufs[j], 1, out);
896 TASK_STATS_ADD_TX_NON_DP(&tbase->aux->stats, 1);
// Global IPv6 address/prefix information relayed by the master.
898 case IPV6_INFO_FROM_MASTER:
899 // addr = ctrl_ring_get_data(mbufs[j]);
900 ip6 = ctrl_ring_get_ipv6_addr(mbufs[j]);
// No global address yet: adopt the received one.
901 if (memcmp(&l3->global_ipv6 , &null_addr, 16) == 0) {
902 memcpy(&l3->global_ipv6, ip6, sizeof(struct ipv6_addr));
903 plog_info("Core %d task %d received global IP "IPv6_BYTES_FMT"\n", l3->core_id, l3->task_id, IPv6_BYTES(ip6->bytes));
// Otherwise only the first 8 bytes (the prefix) are compared; the match is logged once.
904 } else if (memcmp(&l3->global_ipv6, ip6, 8) == 0) {
905 if (l3->prefix_printed == 0) {
906 plog_info("Core %d task %d received expected prefix "IPv6_PREFIX_FMT"\n", l3->core_id, l3->task_id, IPv6_PREFIX(ip6->bytes));
907 l3->prefix_printed = 1;
910 plog_warn("Core %d task %d received unexpected prefix "IPv6_PREFIX_FMT", IP = "IPv6_PREFIX_FMT"\n", l3->core_id, l3->task_id, IPv6_PREFIX(ip6->bytes), IPv6_PREFIX(l3->global_ipv6.bytes));
915 plog_err("Unexpected message received: %d\n", command);