2 // Copyright (c) 2010-2020 Intel Corporation
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
8 // http://www.apache.org/licenses/LICENSE-2.0
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
17 #include <rte_lcore.h>
19 #include <rte_hash_crc.h>
22 #include "task_base.h"
27 #include "handle_master.h"
28 #include "prox_port_cfg.h"
29 #include "packet_utils.h"
30 #include "prox_shared.h"
32 #include "hash_entry_types.h"
33 #include "prox_compat.h"
36 #include "prox_ipv6.h"
// find_ip: locate the IPv4 destination address of the frame starting at pkt.
//   pkt    - start of the Ethernet frame
//   len    - number of bytes available in the frame
//   ip_dst - out: receives ip->dst_addr, copied without byte swapping
//   vlan   - out: receives the tag value of the last VLAN header walked over
// NOTE(review): this view omits short lines (the embedded original line numbers jump), so the
// l2_len update inside the loop, the ether_type dispatch and the return statements are not
// visible here. Callers in this file treat a non-zero return as "no IPv4 address found".
39 static inline int find_ip(struct ether_hdr_arp *pkt, uint16_t len, uint32_t *ip_dst, uint16_t *vlan)
41 prox_rte_vlan_hdr *vlan_hdr;
42 prox_rte_ether_hdr *eth_hdr = (prox_rte_ether_hdr*)pkt;
// NOTE(review): the block near the end of this function declares its own `ip`; this outer `ip` is not used in the visible lines.
43 prox_rte_ipv4_hdr *ip;
// ether_type is kept exactly as stored in the header and compared against the ETYPE_* constants.
44 uint16_t ether_type = eth_hdr->ether_type;
45 uint16_t l2_len = sizeof(prox_rte_ether_hdr);
// Walk stacked 802.1Q / 802.1ad tags for as long as a further VLAN header still fits within len.
49 while (((ether_type == ETYPE_VLAN) || (ether_type == ETYPE_8021ad)) && (l2_len + sizeof(prox_rte_vlan_hdr) < len)) {
50 vlan_hdr = (prox_rte_vlan_hdr *)((uint8_t *)pkt + l2_len);
52 ether_type = vlan_hdr->eth_proto;
// The 0xFF0F mask is applied to the raw field before the byte swap.
// NOTE(review): on a little-endian host this keeps the 12 VLAN-id bits and drops the priority/DEI
// bits; on a big-endian host the same mask would select different bits - confirm target hosts.
53 *vlan = rte_be_to_cpu_16(vlan_hdr->vlan_tci & 0xFF0F); // Store VLAN, or CVLAN if QinQ
59 // In case of MPLS, next hop MAC is based on MPLS, not destination IP
71 plog_warn("Unsupported packet type %x - CRC might be wrong\n", ether_type);
// l2_len == 0 skips the IPv4 read; otherwise a whole IPv4 header must fit within len.
75 if (l2_len && (l2_len + sizeof(prox_rte_ipv4_hdr) <= len)) {
76 prox_rte_ipv4_hdr *ip = (prox_rte_ipv4_hdr *)((uint8_t *)pkt + l2_len);
77 // TODO: implement LPM => replace ip_dst by next hop IP DST
78 *ip_dst = ip->dst_addr;
// find_vlan: walk the stacked VLAN / 802.1ad headers at the start of pkt and report the tag value
// of the last one through *vlan. Unlike find_ip, no IP header is read.
//   pkt  - start of the Ethernet frame
//   len  - number of bytes available in the frame
//   vlan - out: written once per VLAN header walked over
// NOTE(review): short lines are missing from this view, so any initial value given to *vlan
// and the l2_len update inside the loop are not visible.
84 static inline void find_vlan(struct ether_hdr_arp *pkt, uint16_t len, uint16_t *vlan)
86 prox_rte_vlan_hdr *vlan_hdr;
87 prox_rte_ether_hdr *eth_hdr = (prox_rte_ether_hdr*)pkt;
88 uint16_t ether_type = eth_hdr->ether_type;
89 uint16_t l2_len = sizeof(prox_rte_ether_hdr);
// Continue only while a further VLAN header still fits within len.
93 while (((ether_type == ETYPE_8021ad) || (ether_type == ETYPE_VLAN)) && (l2_len + sizeof(prox_rte_vlan_hdr) < len)) {
94 vlan_hdr = (prox_rte_vlan_hdr *)((uint8_t *)pkt + l2_len);
96 ether_type = vlan_hdr->eth_proto;
// Mask is applied before the byte swap; NOTE(review): the bits kept depend on host endianness - confirm.
97 *vlan = rte_be_to_cpu_16(vlan_hdr->vlan_tci & 0xFF0F); // Store VLAN, or CVLAN if QinQ
// find_ip6: locate the IPv6 header of the frame at pkt, looking through at most two VLAN tags.
//   pkt    - start of the Ethernet frame
//   len    - number of bytes available in the frame
//   ip_dst - out: receives a copy of the IPv6 destination address
//   vlan   - out: receives the tag value of the innermost VLAN header seen
// In the visible IPv6 branch the function returns a pointer to the source address field
// inside the packet, so the caller can overwrite it in place.
// NOTE(review): short lines are missing from this view; the l2_len adjustments for the VLAN
// headers and the return used when no IPv6 header is found are not visible (the caller checks
// for NULL).
101 static inline struct ipv6_addr *find_ip6(prox_rte_ether_hdr *pkt, uint16_t len, struct ipv6_addr *ip_dst, uint16_t *vlan)
103 uint16_t ether_type = pkt->ether_type;
104 uint16_t l2_len = sizeof(prox_rte_ether_hdr);
// NOTE(review): unlike find_ip, no visible length check precedes the reads of the VLAN headers below.
107 if ((ether_type == ETYPE_VLAN) || (ether_type == ETYPE_8021ad)) {
108 prox_rte_vlan_hdr *vlan_hdr = (prox_rte_vlan_hdr *)((uint8_t *)pkt + l2_len);
109 ether_type = vlan_hdr->eth_proto;
111 *vlan = rte_be_to_cpu_16(vlan_hdr->vlan_tci & 0xFF0F);
// Second (inner) tag: only ETYPE_VLAN is accepted at this level.
112 if (ether_type == ETYPE_VLAN) {
113 vlan_hdr = (prox_rte_vlan_hdr *)(vlan_hdr + 1);
114 ether_type = vlan_hdr->eth_proto;
116 *vlan = rte_be_to_cpu_16(vlan_hdr->vlan_tci & 0xFF0F);
// The whole IPv6 header must fit within len before it is read.
119 if ((ether_type == ETYPE_IPv6) && (l2_len + sizeof(prox_rte_ipv6_hdr) <= len)) {
120 prox_rte_ipv6_hdr *ip = (prox_rte_ipv6_hdr *)((uint8_t *)pkt + l2_len);
121 // TODO: implement LPM => replace ip_dst by next hop IP DST
122 memcpy(ip_dst, &ip->dst_addr, sizeof(struct ipv6_addr));
123 return (struct ipv6_addr *)&ip->src_addr;
// send_unsollicited_neighbour_advertisement: for each of the task's local and global IPv6
// addresses that is non-zero, take an mbuf from the task's ARP/ND pool, build an unsolicited
// neighbour advertisement for that address and hand it to tx_ctrlplane_pkt.
//   tbase - task whose l3 state (addresses, pool, reachable port) is used
// NOTE(review): short lines are missing from this view (declaration of `ret`, else branches,
// the condition guarding the final error message).
128 void send_unsollicited_neighbour_advertisement(struct task_base *tbase)
131 uint8_t out = 0, port_id = tbase->l3.reachable_port_id;
132 struct rte_mbuf *mbuf = NULL;
// The address is reinterpreted as an __int128 and compared with zero to test "address configured".
134 if (*(__int128 *)(&tbase->l3.local_ipv6) != 0) {
135 ret = rte_mempool_get(tbase->l3.arp_nd_pool, (void **)&mbuf);
136 if (likely(ret == 0)) {
137 mbuf->port = port_id;
// The advertisement uses the port's MAC address and its first VLAN tag.
138 build_neighbour_advertisement(tbase->l3.tmaster, mbuf, &prox_port_cfg[port_id].eth_addr, &tbase->l3.local_ipv6, PROX_UNSOLLICITED, prox_port_cfg[port_id].vlan_tags[0]);
139 tbase->aux->tx_ctrlplane_pkt(tbase, &mbuf, 1, &out);
140 TASK_STATS_ADD_TX_NON_DP(&tbase->aux->stats, 1);
142 plog_err("Failed to get a mbuf from arp/ndp mempool\n");
// Same sequence for the global address.
146 if (*(__int128 *)(&tbase->l3.global_ipv6) != 0) {
147 ret = rte_mempool_get(tbase->l3.arp_nd_pool, (void **)&mbuf);
148 if (likely(ret == 0)) {
149 mbuf->port = port_id;
150 build_neighbour_advertisement(tbase->l3.tmaster, mbuf, &prox_port_cfg[port_id].eth_addr, &tbase->l3.global_ipv6, PROX_UNSOLLICITED, prox_port_cfg[port_id].vlan_tags[0]);
151 tbase->aux->tx_ctrlplane_pkt(tbase, &mbuf, 1, &out);
152 TASK_STATS_ADD_TX_NON_DP(&tbase->aux->stats, 1);
154 plog_err("Failed to get a mbuf from arp/ndp mempool\n");
159 plog_err("No neighbor advertisement sent as no local or global ipv6\n");
// send_router_sollicitation: take one mbuf from the task's ARP/ND pool, build a router
// solicitation with the port's MAC address, targ->local_ipv6 and the port's first VLAN tag,
// and hand it to tx_ctrlplane_pkt. Logs an error when no mbuf is available.
//   tbase - task providing the pool, reachable port id and transmit function
//   targ  - task arguments providing the local IPv6 address
// NOTE(review): the declaration of `ret` and the else line are not visible in this view.
163 static void send_router_sollicitation(struct task_base *tbase, struct task_args *targ)
166 uint8_t out = 0, port_id = tbase->l3.reachable_port_id;
167 struct rte_mbuf *mbuf;
169 ret = rte_mempool_get(tbase->l3.arp_nd_pool, (void **)&mbuf);
170 if (likely(ret == 0)) {
171 mbuf->port = port_id;
172 build_router_sollicitation(mbuf, &prox_port_cfg[port_id].eth_addr, &targ->local_ipv6, prox_port_cfg[port_id].vlan_tags[0]);
173 tbase->aux->tx_ctrlplane_pkt(tbase, &mbuf, 1, &out);
// Counted as a non-dataplane transmit.
174 TASK_STATS_ADD_TX_NON_DP(&tbase->aux->stats, 1);
176 plog_err("Failed to get a mbuf from arp/ndp mempool\n");
180 /* This implementation could be improved: instead of checking each time we send a packet whether we also need
181 to send an ARP, we should only check whether the MAC is valid.
182 We should check arp_ndp_retransmit_timeout in the master process. This would also require the generating task to clear its arp ring
183 to avoid sending many ARPs while starting after a long stop.
184 We could also check for reachable_timeout in the master so that the dataplane only has to check whether the MAC is available,
185 but this would require either thread safety, or the exchange of information between master and generating core.
// add_key_and_send_arp: insert *ip_dst into ip_hash and initialise the matching table slot.
//   ip_hash - hash table keyed by IPv4 address
//   ip_dst  - address to insert
//   entries - table indexed by the position returned by rte_hash_add_key
//   nh      - next-hop index stored in the new entry
//   time    - out: set to the address of the entry's arp_ndp_retransmit_timeout field
//   tsc, hz, arp_ndp_retransmit_timeout - not used in the visible lines
// NOTE(review): the return statements are not visible in this view; callers return this
// function's result directly as their own action code.
188 static inline int add_key_and_send_arp(struct rte_hash *ip_hash, uint32_t *ip_dst, struct arp_table *entries, uint64_t tsc, uint64_t hz, uint32_t arp_ndp_retransmit_timeout, prox_next_hop_index_type nh, uint64_t **time)
190 int ret = rte_hash_add_key(ip_hash, (const void *)ip_dst);
191 if (unlikely(ret < 0)) {
192 // No reason to send ARP, as reply would be anyhow ignored
193 plogx_err("Unable to add ip "IPv4_BYTES_FMT" in mac_hash\n", IP4(*ip_dst));
// ret is used as the index into entries, so entries must be the base of the table paired with ip_hash.
196 entries[ret].ip = *ip_dst;
197 entries[ret].nh = nh;
198 *time = &entries[ret].arp_ndp_retransmit_timeout;
// update_mac_and_send_mbuf: decide what to do with a packet whose next hop is described by entry,
// based on two deadlines compared against tsc:
//   - tsc before both deadlines: copy the cached MAC into *mac
//   - retransmit deadline passed: *time is pointed at the entry's retransmit field; the cached
//     MAC is additionally copied (SEND_MBUF_AND_ARP_ND) if the reachable deadline has not passed
//   - otherwise: see the trailing comment (drop and wait for the reply)
//   hz and arp_ndp_retransmit_timeout are not used in the visible lines.
// NOTE(review): only the SEND_MBUF_AND_ARP_ND return is visible in this view; the other
// return statements are on omitted lines.
203 static inline int update_mac_and_send_mbuf(struct arp_table *entry, prox_rte_ether_addr *mac, uint64_t tsc, uint64_t hz, uint32_t arp_ndp_retransmit_timeout, uint64_t **time)
205 if (likely((tsc < entry->arp_ndp_retransmit_timeout) && (tsc < entry->reachable_timeout))) {
206 memcpy(mac, &entry->mac, sizeof(prox_rte_ether_addr));
// Strict '>' here and strict '<' above: tsc equal to the retransmit deadline falls to the last case.
208 } else if (tsc > entry->arp_ndp_retransmit_timeout) {
209 // long time since we have sent an arp, send arp
210 *time = &entry->arp_ndp_retransmit_timeout;
211 if (tsc < entry->reachable_timeout){
212 // MAC is valid in the table => send also the mbuf
213 memcpy(mac, &entry->mac, sizeof(prox_rte_ether_addr));
214 return SEND_MBUF_AND_ARP_ND;
216 // MAC still unknown, or timed out => only send ARP
220 // MAC is unknown and we already sent an ARP recently, drop mbuf and wait for ARP reply
// write_dst_mac: choose the destination MAC for an outgoing IPv4 packet and tell the caller
// whether to send the packet, request an ARP, both, or neither.
//   tbase  - task whose l3 state (LPM, hash, ARP tables, gateway) is consulted
//   mbuf   - packet; its Ethernet destination address is overwritten when a MAC is known
//   ip_dst - out: IPv4 address extracted from the packet by find_ip
//   vlan   - out: VLAN tag found in the packet
//   time   - out: set to the retransmit-timeout field of the entry involved
//   tsc    - current timestamp, compared with the stored deadlines
// Three strategies are visible: routing-table lookup, a single gateway, and a direct
// per-destination table (linear search in optimized_arp_table while l3->n_pkts < 4, hash table after).
// NOTE(review): short lines are missing from this view (branch conditions selecting the strategy,
// most return statements, counter updates), so the exact control flow cannot be read from here.
224 int write_dst_mac(struct task_base *tbase, struct rte_mbuf *mbuf, uint32_t *ip_dst, uint16_t *vlan, uint64_t **time, uint64_t tsc)
226 const uint64_t hz = rte_get_tsc_hz();
227 struct ether_hdr_arp *packet = rte_pktmbuf_mtod(mbuf, struct ether_hdr_arp *);
228 prox_rte_ether_addr *mac = &packet->ether_hdr.d_addr;
229 prox_next_hop_index_type next_hop_index;
// Function-static state for rate limiting the "No route" message; shared by every caller of this function.
230 static uint64_t last_tsc = 0, n_no_route = 0;
232 struct l3_base *l3 = &(tbase->l3);
234 // First find the next hop
236 // A routing table was configured
237 // If a gw (gateway_ipv4) is also specified, it is used as default gw only i.e. lowest priority (shortest prefix)
238 // This is implemented automatically through lpm
239 uint16_t len = rte_pktmbuf_pkt_len(mbuf);
240 if (find_ip(packet, len, ip_dst, vlan) != 0) {
241 // Unable to find IP address => non IP packet => send it as it
// The LPM is queried with the byte-swapped address.
244 if (unlikely(rte_lpm_lookup(l3->ipv4_lpm, rte_bswap32(*ip_dst), &next_hop_index) != 0)) {
245 // Prevent printing too many messages
// At most one message per rte_get_tsc_hz() ticks.
247 if (tsc > last_tsc + rte_get_tsc_hz()) {
248 plogx_err("No route to IP "IPv4_BYTES_FMT" (%ld times)\n", IP4(*ip_dst), n_no_route);
254 struct arp_table *entry = &l3->next_hops[next_hop_index];
258 return update_mac_and_send_mbuf(entry, mac, tsc, hz, l3->arp_ndp_retransmit_timeout, time);
261 // no next ip: this is a local route
262 // Find IP in lookup table. Send ARP if not found
263 int ret = rte_hash_lookup(l3->ip_hash, (const void *)ip_dst);
264 if (unlikely(ret < 0)) {
265 // IP not found, try to send an ARP
266 return add_key_and_send_arp(l3->ip_hash, ip_dst, l3->arp_table, tsc, hz, l3->arp_ndp_retransmit_timeout, MAX_HOP_INDEX, time);
268 return update_mac_and_send_mbuf(&l3->arp_table[ret], mac, tsc, hz, l3->arp_ndp_retransmit_timeout, time);
272 // No Routing table specified: only a local ip and maybe a gateway
273 // Old default behavior: if a gw is specified, ALL packets go to this gateway (even those we could send w/o the gw
275 uint16_t len = rte_pktmbuf_pkt_len(mbuf);
// Gateway path: only the VLAN tag is extracted from the packet; the gateway entry supplies the MAC.
277 find_vlan(packet, len, vlan);
278 if (likely((l3->flags & FLAG_DST_MAC_KNOWN) && (tsc < l3->gw.arp_ndp_retransmit_timeout) && (tsc < l3->gw.reachable_timeout))) {
279 memcpy(mac, &l3->gw.mac, sizeof(prox_rte_ether_addr));
281 } else if (tsc > l3->gw.arp_ndp_retransmit_timeout) {
282 // long time since we have successfully sent an arp, send arp
283 // If sending ARP failed (ring full) then arp_ndp_retransmit_timeout is not updated to avoid having to wait 1 sec to send ARP REQ again
284 *time = &l3->gw.arp_ndp_retransmit_timeout;
// The stored timeout is multiplied by hz / 1000 to obtain a tick count (i.e. it is treated as milliseconds).
285 l3->gw.arp_ndp_retransmit_timeout = tsc + l3->arp_ndp_retransmit_timeout * hz / 1000;
288 if ((l3->flags & FLAG_DST_MAC_KNOWN) && (tsc < l3->gw.reachable_timeout)){
289 // MAC is valid in the table => send also the mbuf
290 memcpy(mac, &l3->gw.mac, sizeof(prox_rte_ether_addr));
291 return SEND_MBUF_AND_ARP_ND;
293 // MAC still unknown, or timed out => only send ARP
297 // MAC is unknown and we already sent an ARP recently, drop mbuf and wait for ARP reply
// Direct path: neither routing table nor gateway handled the packet.
302 if (find_ip(packet, len, ip_dst, vlan) != 0) {
303 // Unable to find IP address => non IP packet => send it as it
306 if (likely(l3->n_pkts < 4)) {
// Linear search over the destinations tracked so far.
307 for (unsigned int idx = 0; idx < l3->n_pkts; idx++) {
308 if (*ip_dst == l3->optimized_arp_table[idx].ip) {
309 return update_mac_and_send_mbuf(&l3->optimized_arp_table[idx], mac, tsc, hz, l3->arp_ndp_retransmit_timeout, time);
312 // IP address not found in table
313 l3->optimized_arp_table[l3->n_pkts].ip = *ip_dst;
314 *time = &l3->optimized_arp_table[l3->n_pkts].arp_ndp_retransmit_timeout;
317 if (l3->n_pkts < 4) {
321 // We have too many IP addresses to search linearly; lets use hash table instead => copy all entries in hash table
322 for (uint32_t idx = 0; idx < l3->n_pkts; idx++) {
323 uint32_t ip = l3->optimized_arp_table[idx].ip;
324 int ret = rte_hash_add_key(l3->ip_hash, (const void *)&ip);
326 // This should not happen as few entries so far.
327 // If it happens, we still send the ARP as easier:
328 // If the ARP corresponds to this error, the ARP reply will be ignored
329 // If ARP does not correspond to this error/ip, then ARP reply will be handled.
330 plogx_err("Unable add ip "IPv4_BYTES_FMT" in mac_hash (already %d entries)\n", IP4(ip), idx);
// The whole entry (including MAC and deadlines) is copied to the slot chosen by the hash table.
332 memcpy(&l3->arp_table[ret], &l3->optimized_arp_table[idx], sizeof(struct arp_table));
337 // Find IP in lookup table. Send ARP if not found
338 int ret = rte_hash_lookup(l3->ip_hash, (const void *)ip_dst);
339 if (unlikely(ret < 0)) {
340 // IP not found, try to send an ARP
// NOTE(review): ret is negative on this path, yet &l3->arp_table[ret] is passed as the table base;
// the routed branch above passes l3->arp_table. add_key_and_send_arp indexes its `entries`
// argument with the slot returned by rte_hash_add_key, so this base looks offset by a
// negative index - verify and align with the call above.
341 return add_key_and_send_arp(l3->ip_hash, ip_dst, &l3->arp_table[ret], tsc, hz, l3->arp_ndp_retransmit_timeout, MAX_HOP_INDEX, time);
344 return update_mac_and_send_mbuf(&l3->arp_table[ret], mac, tsc, hz, l3->arp_ndp_retransmit_timeout, time);
// write_ip6_dst_mac: IPv6 counterpart of write_dst_mac. Rewrites the packet's IPv6 source
// address, then looks up the destination MAC and decides whether the packet and/or a
// neighbour discovery request should be sent.
//   tbase  - task whose l3 state is consulted and updated
//   mbuf   - packet; Ethernet destination and IPv6 source address may be overwritten
//   ip_dst - out: IPv6 destination (replaced by the gateway address when the gateway is used)
//   vlan   - out: VLAN tag found in the packet
//   tsc    - current timestamp, compared with the stored deadlines
// NOTE(review): short lines are missing from this view (else lines, most return statements,
// the n_pkts update), so the exact control flow cannot be read from here.
351 int write_ip6_dst_mac(struct task_base *tbase, struct rte_mbuf *mbuf, struct ipv6_addr *ip_dst, uint16_t *vlan, uint64_t tsc)
353 const uint64_t hz = rte_get_tsc_hz();
354 prox_rte_ether_hdr *packet = rte_pktmbuf_mtod(mbuf, prox_rte_ether_hdr *);
355 prox_rte_ether_addr *mac = &packet->d_addr;
356 struct ipv6_addr *used_ip_src;
358 uint16_t len = rte_pktmbuf_pkt_len(mbuf);
// pkt_src_ip6 points at the source address field inside the packet (see find_ip6).
360 struct ipv6_addr *pkt_src_ip6;
361 if ((pkt_src_ip6 = find_ip6(packet, len, ip_dst, vlan)) == NULL) {
362 // Unable to find IP address => non IP packet => send it as it
365 struct l3_base *l3 = &(tbase->l3);
367 // Configure source IP
// Only the first 8 bytes of the addresses are compared (read as one uint64_t).
368 if (*(uint64_t *)(&l3->local_ipv6) == *(uint64_t *)ip_dst) {
369 // Same prefix as local -> use local
370 used_ip_src = &l3->local_ipv6;
371 } else if (*(uint64_t *)(&l3->global_ipv6) == *(uint64_t *)ip_dst) {
372 // Same prefix as global -> use global
373 used_ip_src = &l3->global_ipv6;
// A non-zero gateway address: use the global address as source and resolve the gateway instead of the destination.
374 } else if (*(__int128 *)(&l3->gw.ip6) != 0) {
375 used_ip_src = &l3->global_ipv6;
376 memcpy(ip_dst, &l3->gw.ip6, sizeof(struct ipv6_addr));
377 } else if (*(__int128 *)(&l3->global_ipv6) != 0) {
378 // Global IP is defined -> use it
379 used_ip_src = &l3->global_ipv6;
381 plog_info("Error as trying to send a packet to "IPv6_BYTES_FMT" using "IPv6_BYTES_FMT" (local)\n", IPv6_BYTES(ip_dst->bytes), IPv6_BYTES(l3->local_ipv6.bytes));
// Overwrite the packet's source address in place with the selected address.
384 rte_memcpy(pkt_src_ip6, used_ip_src, sizeof(struct ipv6_addr));
387 if (likely(l3->n_pkts < 4)) {
// Linear search; addresses are compared as a single __int128.
388 for (unsigned int idx = 0; idx < l3->n_pkts; idx++) {
389 if (*(__int128 *)ip_dst == *(__int128 *)(&l3->optimized_arp_table[idx].ip6)) {
390 // IP address already in table
391 if ((tsc < l3->optimized_arp_table[idx].arp_ndp_retransmit_timeout) && (tsc < l3->optimized_arp_table[idx].reachable_timeout)) {
392 // MAC address was recently updated in table, use it
393 // plog_dbg("Valid MAC address found => send packet\n");
394 rte_memcpy(mac, &l3->optimized_arp_table[idx].mac, sizeof(prox_rte_ether_addr));
396 } else if (tsc > l3->optimized_arp_table[idx].arp_ndp_retransmit_timeout) {
397 // NDP not sent since a long time, send NDP
// Unlike the IPv4 path, the retransmit deadline is advanced here directly rather than reported through a `time` out-parameter.
398 l3->optimized_arp_table[idx].arp_ndp_retransmit_timeout = tsc + l3->arp_ndp_retransmit_timeout * hz / 1000;
399 if (tsc < l3->optimized_arp_table[idx].reachable_timeout) {
400 // MAC still valid => also send mbuf
401 plog_dbg("Valid MAC found but NDP retransmit timeout => send packet and NDP\n");
402 memcpy(mac, &l3->optimized_arp_table[idx].mac, sizeof(prox_rte_ether_addr));
403 return SEND_MBUF_AND_ARP_ND;
405 plog_dbg("Unknown MAC => send NDP but cannot send packet\n");
406 // MAC unvalid => only send NDP
410 // NDP timeout elapsed, MAC not valid anymore but waiting for NDP reply
411 // plog_dbg("NDP reachable timeout elapsed - waiting for NDP reply\n");
416 // IP address not found in table
417 memcpy(&l3->optimized_arp_table[l3->n_pkts].ip6, ip_dst, sizeof(struct ipv6_addr));
418 l3->optimized_arp_table[l3->n_pkts].arp_ndp_retransmit_timeout = tsc + l3->arp_ndp_retransmit_timeout * hz / 1000;
421 if (l3->n_pkts < 4) {
425 // We have too many IP addresses to search linearly; lets use hash table instead => copy all entries in hash table
426 for (uint32_t idx = 0; idx < l3->n_pkts; idx++) {
427 struct ipv6_addr *ip6 = &l3->optimized_arp_table[idx].ip6;
428 int ret = rte_hash_add_key(l3->ip6_hash, (const void *)ip6);
430 // This should not happen as few entries so far.
431 // If it happens, we still send the NDP as easier:
432 // If the NDP corresponds to this error, the NDP reply will be ignored
433 // If NDP does not correspond to this error/ip, then NDP reply will be handled.
434 plogx_err("Unable add ip "IPv6_BYTES_FMT" in mac_hash (already %d entries)\n", IPv6_BYTES(ip6->bytes), idx);
// arp_table is indexed by the slot returned from ip6_hash here; the IPv4 path indexes the same array with slots from ip_hash.
436 memcpy(&l3->arp_table[ret], &l3->optimized_arp_table[idx], sizeof(struct arp_table));
441 // Find IP in lookup table. Send ND if not found
442 int ret = rte_hash_lookup(l3->ip6_hash, (const void *)ip_dst);
443 if (unlikely(ret < 0)) {
444 // IP not found, try to send an ND
// NOTE(review): this declaration shadows the `ret` declared two lines above.
445 int ret = rte_hash_add_key(l3->ip6_hash, (const void *)ip_dst);
447 // No reason to send NDP, as reply would be anyhow ignored
448 plogx_err("Unable to add ip "IPv6_BYTES_FMT" in mac_hash\n", IPv6_BYTES(ip_dst->bytes));
451 memcpy(&l3->arp_table[ret].ip6, ip_dst, sizeof(struct ipv6_addr));
452 l3->arp_table[ret].arp_ndp_retransmit_timeout = tsc + l3->arp_ndp_retransmit_timeout * hz / 1000;
457 if (likely((tsc < l3->arp_table[ret].arp_ndp_retransmit_timeout) && (tsc < l3->arp_table[ret].reachable_timeout))) {
458 // MAC still valid and NDP sent recently
459 memcpy(mac, &l3->arp_table[ret].mac, sizeof(prox_rte_ether_addr));
461 } else if (tsc > l3->arp_table[ret].arp_ndp_retransmit_timeout) {
462 // NDP not sent since a long time, send NDP
463 l3->arp_table[ret].arp_ndp_retransmit_timeout = tsc + l3->arp_ndp_retransmit_timeout * hz / 1000;
464 if (tsc < l3->arp_table[ret].reachable_timeout) {
465 // MAC still valid => send also MBUF
466 memcpy(mac, &l3->arp_table[ret].mac, sizeof(prox_rte_ether_addr));
467 return SEND_MBUF_AND_ARP_ND;
// task_init_l3: set up the L3 (ARP/NDP) state of a task: create the IPv4 and/or IPv6 address
// hash tables requested by targ->flags, allocate the shared arp/ndp entry table, register the
// control-plane handler and copy gateway addresses, ids and timeouts from targ into tbase->l3.
//   tbase - task being initialised
//   targ  - task configuration
// NOTE(review): short lines are missing from this view (e.g. the `.name` initialiser of
// hash_params, the else lines of the timeout selection).
480 void task_init_l3(struct task_base *tbase, struct task_args *targ)
482 static char hash_name[30];
483 uint32_t n_entries = MAX_ARP_ENTRIES * 4;
484 const int socket_id = rte_lcore_to_socket_id(targ->lconf->id);
// NOTE(review): unbounded sprintf into a 30-byte static buffer; fits only while both ids print
// in a few digits. The buffer is static, so each call overwrites the previous name.
485 sprintf(hash_name, "A%03d_%03d_mac_table", targ->lconf->id, targ->id);
489 struct rte_hash_parameters hash_params = {
491 .entries = n_entries,
// Key is a 4-byte IPv4 address; key_len is changed below before the IPv6 table is created.
492 .key_len = sizeof(uint32_t),
493 .hash_func = rte_hash_crc,
494 .hash_func_init_val = 0,
495 .socket_id = socket_id,
497 if (targ->flags & TASK_ARG_L3) {
498 plog_info("\t\tInitializing L3 (IPv4)\n");
499 tbase->l3.ip_hash = rte_hash_create(&hash_params);
500 PROX_PANIC(tbase->l3.ip_hash == NULL, "Failed to set up ip hash table\n");
504 if (targ->flags & TASK_ARG_NDP) {
505 plog_info("\t\tInitializing NDP (IPv6)\n");
// Same parameters reused with a 16-byte style key (struct ipv6_addr).
506 hash_params.key_len = sizeof(struct ipv6_addr);
507 tbase->l3.ip6_hash = rte_hash_create(&hash_params);
508 PROX_PANIC(tbase->l3.ip6_hash == NULL, "Failed to set up ip hash table\n");
// One zero-initialised entry table, sized like the hash tables.
510 tbase->l3.arp_table = (struct arp_table *)prox_zmalloc(n_entries * sizeof(struct arp_table), socket_id);
511 PROX_PANIC(tbase->l3.arp_table == NULL, "Failed to allocate memory for %u entries in arp/ndp table\n", n_entries);
// NOTE(review): the size printed is sizeof(struct l3_base), not sizeof(struct arp_table) as allocated above - confirm intent.
512 plog_info("\t\tarp/ndp table, with %d entries of size %ld\n", n_entries, sizeof(struct l3_base));
// Control-plane packets for this task are handled by handle_ctrl_plane_pkts.
514 targ->lconf->ctrl_func_p[targ->task] = handle_ctrl_plane_pkts;
515 targ->lconf->ctrl_timeout = freq_to_tsc(targ->ctrl_freq);
516 tbase->l3.gw.ip = rte_cpu_to_be_32(targ->gateway_ipv4);
517 memcpy(&tbase->l3.gw.ip6, &targ->gateway_ipv6, sizeof(struct ipv6_addr));
518 tbase->flags |= TASK_L3;
519 tbase->l3.core_id = targ->lconf->id;
520 tbase->l3.task_id = targ->id;
521 tbase->l3.tmaster = targ->tmaster;
522 tbase->l3.seed = (uint)rte_rdtsc();
// A configured value of 0 selects the DEFAULT_* constant for each timeout.
523 if (targ->reachable_timeout != 0)
524 tbase->l3.reachable_timeout = targ->reachable_timeout;
526 tbase->l3.reachable_timeout = DEFAULT_ARP_TIMEOUT;
527 if (targ->arp_ndp_retransmit_timeout != 0)
528 tbase->l3.arp_ndp_retransmit_timeout = targ->arp_ndp_retransmit_timeout;
530 tbase->l3.arp_ndp_retransmit_timeout = DEFAULT_ARP_UPDATE_TIME;
// task_start_l3: start-time L3 setup for a task. The visible steps are:
//   - reconcile the core-section local_ipv4/prefix with the reachable port's first address
//   - register the port's IPv4 addresses with the control plane
//   - when a route table is named, load it into an LPM and add local and default-gateway routes
//   - when NDP is enabled, derive/copy the IPv6 addresses and register as router or node
//   - create the ARP/ND mbuf pool and optionally send a router solicitation and an unsolicited
//     neighbour advertisement
//   tbase - task being started
//   targ  - task configuration
// NOTE(review): short lines are missing from this view (closing braces, declarations of
// `ret`/`lpm`, else lines, the socket argument of rte_mempool_create), so block extents are
// not fully visible.
533 void task_start_l3(struct task_base *tbase, struct task_args *targ)
535 const int socket_id = rte_lcore_to_socket_id(targ->lconf->id);
536 const int NB_ARP_ND_MBUF = 1024;
537 const int ARP_ND_MBUF_SIZE = 2048;
538 const int NB_CACHE_ARP_ND_MBUF = 256;
540 struct prox_port_cfg *port = find_reachable_port(targ);
// Visible condition: a reachable port exists and no ARP/ND pool has been created for this task yet.
541 if (port && (tbase->l3.arp_nd_pool == NULL)) {
542 static char name[] = "arp0_pool";
// Port id is derived from the pointer's position within prox_port_cfg.
543 tbase->l3.reachable_port_id = port - prox_port_cfg;
// Both sections specify an address: they must agree on address and prefix.
544 if ((targ->local_ipv4 && port->ip_addr[0].ip) && (targ->local_ipv4 != port->ip_addr[0].ip)) {
545 PROX_PANIC(1, "local_ipv4 in core section ("IPv4_BYTES_FMT") differs from port section ("IPv4_BYTES_FMT")\n", IP4(rte_be_to_cpu_32(targ->local_ipv4)), IP4(rte_be_to_cpu_32(port->ip_addr[0].ip)));
547 if ((targ->local_ipv4 && port->ip_addr[0].ip) && (targ->local_prefix != port->ip_addr[0].prefix)) {
548 PROX_PANIC(1, "local_ipv4 prefix in core section (%d) differs from port section (%d)\n", targ->local_prefix, port->ip_addr[0].prefix);
// Only the core section specifies an address: copy it to the port.
550 if (!port->ip_addr[0].ip && targ->local_ipv4) {
551 port->ip_addr[0].ip = targ->local_ipv4;
552 port->ip_addr[0].prefix = targ->local_prefix;
554 port->vlan_tags[0] = 0;
555 plog_info("Setting port local_ipv4 from core %d local_ipv4 to "IPv4_BYTES_FMT"\n", tbase->l3.reachable_port_id, IP4(rte_be_to_cpu_32(port->ip_addr[0].ip)));
// Register every configured per-VLAN address of the port with the control plane.
557 for (int vlan_id = 0; vlan_id < port->n_vlans; vlan_id++) {
558 if (port->ip_addr[vlan_id].ip)
559 register_ip_to_ctrl_plane(tbase->l3.tmaster, rte_be_to_cpu_32(port->ip_addr[vlan_id].ip), tbase->l3.reachable_port_id, targ->lconf->id, targ->id);
// A non-empty route_table name enables routing-table mode.
561 if (strcmp(targ->route_table, "") != 0) {
565 PROX_PANIC(port->n_vlans == 0, "missing local_ipv4 while route table is specified in L3 mode\n");
567 // LPM might be modified runtime => do not share with other cores
568 ret = lua_to_lpm4(prox_lua(), GLOBAL, targ->route_table, socket_id, &lpm);
569 PROX_PANIC(ret, "Failed to load IPv4 LPM:\n%s\n", get_lua_to_errors());
571 tbase->l3.ipv4_lpm = lpm->rte_lpm;
572 tbase->l3.next_hops = prox_zmalloc(sizeof(*tbase->l3.next_hops) * MAX_HOP_INDEX, socket_id);
573 PROX_PANIC(tbase->l3.next_hops == NULL, "Could not allocate memory for next hop\n");
// Copy the next hops defined by the loaded table (byte-swapped) into the task's next_hops array.
575 for (uint32_t i = 0; i < MAX_HOP_INDEX; i++) {
576 if (!lpm->next_hops[i].ip_dst)
579 tbase->l3.next_hops[i].ip = rte_bswap32(lpm->next_hops[i].ip_dst);
580 int tx_port = lpm->next_hops[i].mac_port.out_idx;
581 // gen only supports one port right now .... hence port = 0
// Panic only when tx_port exceeds both the tx port count and the tx ring count.
582 if ((tx_port > targ->nb_txports - 1) && (tx_port > targ->nb_txrings - 1)) {
583 PROX_PANIC(1, "Routing Table contains port %d but only %d tx port/ %d ring:\n", tx_port, targ->nb_txports, targ->nb_txrings);
586 plog_info("Using routing table %s in l3 mode, with %d gateways\n", targ->route_table, tbase->l3.nb_gws);
588 // Last but one (x n_vlans) "next_hop_index" is not a gateway but direct routes
// A next hop with ip == 0 marks a directly connected route (see write_dst_mac's "local route" comment).
589 for (int vlan_id = 0; vlan_id < port->n_vlans; vlan_id++) {
590 tbase->l3.next_hops[tbase->l3.nb_gws].ip = 0;
591 ret = rte_lpm_add(tbase->l3.ipv4_lpm, port->ip_addr[vlan_id].ip, port->ip_addr[vlan_id].prefix, tbase->l3.nb_gws++);
592 PROX_PANIC(ret, "Failed to add local_ipv4 "IPv4_BYTES_FMT"/%d to lpm\n", IP4(port->ip_addr[vlan_id].ip), port->ip_addr[vlan_id].prefix);
595 // Last "next_hop_index" is default gw
596 tbase->l3.next_hops[tbase->l3.nb_gws].ip = rte_bswap32(targ->gateway_ipv4);
// Prefix length 0: the gateway route matches only when nothing more specific does.
597 if (targ->gateway_ipv4) {
598 ret = rte_lpm_add(tbase->l3.ipv4_lpm, targ->gateway_ipv4, 0, tbase->l3.nb_gws++);
599 PROX_PANIC(ret, "Failed to add gateway_ipv4 "IPv4_BYTES_FMT"/%d to lpm\n", IP4(tbase->l3.gw.ip), 0);
603 master_init_vdev(tbase->l3.tmaster, tbase->l3.reachable_port_id, targ->lconf->id, targ->id);
605 // Create IPv6 addr if none were configured
606 if (targ->flags & TASK_ARG_NDP) {
// All-zero local address: build one from set_link_local and the port's MAC (set_EUI).
607 if (!memcmp(&targ->local_ipv6, &null_addr, sizeof(struct ipv6_addr))) {
608 set_link_local(&targ->local_ipv6);
609 set_EUI(&targ->local_ipv6, &port->eth_addr);
611 plog_info("\tCore %d, task %d, local IPv6 addr is "IPv6_BYTES_FMT" (%s)\n",
612 targ->lconf->id, targ->id,
613 IPv6_BYTES(targ->local_ipv6.bytes),
614 IP6_Canonical(&targ->local_ipv6));
615 memcpy(&tbase->l3.local_ipv6, &targ->local_ipv6, sizeof(struct ipv6_addr));
// The global address is copied only when one was configured (non-zero).
617 if (memcmp(&targ->global_ipv6, &null_addr, sizeof(struct ipv6_addr))) {
618 memcpy(&tbase->l3.global_ipv6, &targ->global_ipv6, sizeof(struct ipv6_addr));
619 plog_info("\tCore %d, task %d, global IPv6 addr is "IPv6_BYTES_FMT" (%s)\n",
620 targ->lconf->id, targ->id,
621 IPv6_BYTES(targ->global_ipv6.bytes),
622 IP6_Canonical(&targ->global_ipv6));
624 if (targ->ipv6_router)
625 register_router_to_ctrl_plane(tbase->l3.tmaster, tbase->l3.reachable_port_id, targ->lconf->id, targ->id, &targ->local_ipv6, &targ->global_ipv6, &targ->router_prefix);
627 register_node_to_ctrl_plane(tbase->l3.tmaster, &targ->local_ipv6, &targ->global_ipv6, tbase->l3.reachable_port_id, targ->lconf->id, targ->id);
// Dedicated mbuf pool used for ARP / neighbour discovery packets generated by this task.
631 struct rte_mempool *ret = rte_mempool_create(name, NB_ARP_ND_MBUF, ARP_ND_MBUF_SIZE, NB_CACHE_ARP_ND_MBUF,
632 sizeof(struct rte_pktmbuf_pool_private), rte_pktmbuf_pool_init, NULL, rte_pktmbuf_init, 0,
634 PROX_PANIC(ret == NULL, "Failed to allocate ARP/ND memory pool on socket %u with %u elements\n",
635 rte_socket_id(), NB_ARP_ND_MBUF);
636 plog_info("\tMempool %p (%s) size = %u * %u cache %u, socket %d (for ARP/ND)\n", ret, name, NB_ARP_ND_MBUF,
637 ARP_ND_MBUF_SIZE, NB_CACHE_ARP_ND_MBUF, rte_socket_id());
638 tbase->l3.arp_nd_pool = ret;
// Non-router NDP tasks solicit a router at startup.
639 if ((targ->flags & TASK_ARG_NDP) && (!targ->ipv6_router)) {
640 plog_info("Sending Router Sollicitation\n");
641 send_router_sollicitation(tbase, targ);
643 if ((targ->flags & TASK_ARG_NDP) && (targ->flags & TASK_ARG_SEND_NA_AT_STARTUP)) {
644 plog_info("Sending unsollicited Neighbour Advertisement\n");
645 send_unsollicited_neighbour_advertisement(tbase);
651 void task_set_gateway_ip(struct task_base *tbase, uint32_t ip)
653 tbase->l3.gw.ip = ip;
654 tbase->flags &= ~FLAG_DST_MAC_KNOWN;
// reset_arp_ndp_retransmit_timeout: set the ARP retransmit deadline of the entry matching ip
// to 0, so that the next packet towards it is past the deadline and triggers a new request.
// The entry is searched in the same places the send path uses: hash-indexed arp_table,
// the gateway entry, or the small linear optimized_arp_table (while l3->n_pkts < 4).
//   l3 - L3 state of the task
//   ip - IPv4 address whose entry is reset
// NOTE(review): short lines are missing from this view (declaration of idx, the first branch
// condition, the comparison inside the loop, checks on `ret`).
657 static void reset_arp_ndp_retransmit_timeout(struct l3_base *l3, uint32_t ip)
660 plogx_dbg("MAC entry for IP "IPv4_BYTES_FMT" timeout in kernel\n", IP4(ip));
663 int ret = rte_hash_lookup(l3->ip_hash, (const void *)&ip);
665 l3->arp_table[ret].arp_ndp_retransmit_timeout = 0;
666 } else if (ip == l3->gw.ip) {
667 l3->gw.arp_ndp_retransmit_timeout = 0;
668 } else if (l3->n_pkts < 4) {
669 for (idx = 0; idx < l3->n_pkts; idx++) {
670 uint32_t ip_dst = l3->optimized_arp_table[idx].ip;
// idx < n_pkts after the loop means the search stopped early on a match.
674 if (idx < l3->n_pkts) {
675 l3->optimized_arp_table[idx].arp_ndp_retransmit_timeout = 0;
678 int ret = rte_hash_lookup(l3->ip_hash, (const void *)&ip);
680 l3->arp_table[ret].arp_ndp_retransmit_timeout = 0;
// get_nh_index: find the next-hop slot holding gateway address gw_ip, or claim a new slot.
//   tbase - task whose l3.next_hops / l3.nb_gws are searched and extended
//   gw_ip - gateway IPv4 address, compared as-is with next_hops[i].ip
// Returns MAX_HOP_INDEX when no slot can be provided; the caller treats any value
// >= MAX_HOP_INDEX as failure.
// NOTE(review): short lines are missing from this view (the return inside the loop and the
// nb_gws increment implied by the `nb_gws - 1` return).
685 static prox_next_hop_index_type get_nh_index(struct task_base *tbase, uint32_t gw_ip)
687 // Check if gateway already exists
688 for (prox_next_hop_index_type i = 0; i < tbase->l3.nb_gws; i++) {
689 if (tbase->l3.next_hops[i].ip == gw_ip) {
// Not found: append while there is still room.
693 if (tbase->l3.nb_gws < MAX_HOP_INDEX) {
694 tbase->l3.next_hops[tbase->l3.nb_gws].ip = gw_ip;
696 return tbase->l3.nb_gws - 1;
698 return MAX_HOP_INDEX;
// handle_ctrl_plane_pkts: process a burst of control-plane messages received from the master
// task. Each mbuf carries a command (get_command); the visible cases are:
//   ROUTE_ADD_FROM_MASTER / ROUTE_DEL_FROM_MASTER - add or delete an IPv4 route in the LPM
//   MAC_INFO_FROM_MASTER                          - store (or invalidate) the MAC for an IPv4 address
//   MAC_INFO_FROM_MASTER_FOR_IPV6                 - store the MAC for an IPv6 address
//   SEND_NDP / SEND_ARP_REQUEST / SEND_ARP_REPLY / SEND_ICMP _FROM_MASTER - forward the packet
//   (a further forwarding case for TAP packets that drops PROX pseudo packets)
//   IPV6_INFO_FROM_MASTER                         - learn or check the global IPv6 prefix
//   tbase  - task receiving the messages
//   mbufs  - array of n_pkts message buffers
//   n_pkts - number of messages
// NOTE(review): short lines are missing from this view (switch/break/else lines, several
// declarations such as `out`, `command`, `nh`, some case labels, mbuf release calls), so the
// exact control flow cannot be read from here.
700 void handle_ctrl_plane_pkts(struct task_base *tbase, struct rte_mbuf **mbufs, uint16_t n_pkts)
703 const uint64_t hz = rte_get_tsc_hz();
704 uint32_t ip, ip_dst, idx, gateway_ip, prefix;
705 prox_next_hop_index_type gateway_index;
706 int j, ret, modified_route;
708 struct ipv6_addr *ip6, *ip6_dst;
710 prox_rte_ether_hdr *hdr;
711 struct ether_hdr_arp *hdr_arp;
712 struct l3_base *l3 = &tbase->l3;
713 uint64_t tsc= rte_rdtsc();
// Reachable timeout converted to ticks (value * hz / 1000, i.e. treated as milliseconds).
714 uint64_t reachable_timeout = l3->reachable_timeout * hz / 1000;
716 prox_rte_ipv4_hdr *pip;
717 prox_rte_udp_hdr *udp_hdr;
718 uint8_t port = tbase->l3.reachable_port_id;
// Prefetch passes before the processing loop; the body of the first loop is not visible here.
720 for (j = 0; j < n_pkts; ++j) {
723 for (j = 0; j < n_pkts; ++j) {
724 PREFETCH0(rte_pktmbuf_mtod(mbufs[j], void *));
727 for (j = 0; j < n_pkts; ++j) {
730 out[0] = OUT_HANDLED;
731 command = get_command(mbufs[j]);
732 plogx_dbg("\tReceived %s mbuf %p\n", actions_string[command], mbufs[j]);
734 case ROUTE_ADD_FROM_MASTER:
735 ip = ctrl_ring_get_ip(mbufs[j]);
736 gateway_ip = ctrl_ring_get_gateway_ip(mbufs[j]);
737 prefix = ctrl_ring_get_prefix(mbufs[j]);
// Reuse the gateway's existing next-hop slot or allocate a new one.
738 gateway_index = get_nh_index(tbase, gateway_ip);
739 if (gateway_index >= MAX_HOP_INDEX) {
740 plog_err("Unable to find or define gateway index - too many\n");
// Remember whether the rule already existed, only to pick the log message below.
743 modified_route = rte_lpm_is_rule_present(tbase->l3.ipv4_lpm, rte_bswap32(ip), prefix, &nh);
744 ret = rte_lpm_add(tbase->l3.ipv4_lpm, rte_bswap32(ip), prefix, gateway_index);
746 plog_err("Failed to add route to "IPv4_BYTES_FMT"/%d using "IPv4_BYTES_FMT"(index = %d)\n", IP4(ip), prefix, IP4(gateway_ip), gateway_index);
747 } else if (modified_route)
748 plogx_dbg("Modified route to "IPv4_BYTES_FMT"/%d using "IPv4_BYTES_FMT"(index = %d) (was using "IPv4_BYTES_FMT"(index = %d)\n", IP4(ip), prefix, IP4(gateway_ip), gateway_index, IP4(tbase->l3.next_hops[nh].ip), nh);
750 plogx_dbg("Added new route to "IPv4_BYTES_FMT"/%d using "IPv4_BYTES_FMT"(index = %d)\n", IP4(ip), prefix, IP4(gateway_ip), gateway_index);
754 case ROUTE_DEL_FROM_MASTER:
755 ip = ctrl_ring_get_ip(mbufs[j]);
756 prefix = ctrl_ring_get_prefix(mbufs[j]);
758 ret = rte_lpm_is_rule_present(tbase->l3.ipv4_lpm, rte_bswap32(ip), prefix, &nh);
760 ret = rte_lpm_delete(tbase->l3.ipv4_lpm, rte_bswap32(ip), prefix);
// NOTE(review): this is the route-delete case; "add" in the message below looks like a copy/paste - confirm wording.
762 plog_err("Failed to add rule\n");
764 plog_info("Deleting route to "IPv4_BYTES_FMT"/%d\n", IP4(ip), prefix);
768 case MAC_INFO_FROM_MASTER:
769 hdr_arp = rte_pktmbuf_mtod(mbufs[j], struct ether_hdr_arp *);
770 ip = get_ip(mbufs[j]);
// An all-zero sender MAC is the master's way of signalling that the entry is gone.
772 if (prox_rte_is_zero_ether_addr(&hdr_arp->arp.data.sha)) {
773 // MAC timeout or deleted from kernel table => reset update_time
774 // This will cause us to send new ARP request
775 // However, as reachable_timeout not touched, we should continue sending our regular IP packets
776 reset_arp_ndp_retransmit_timeout(l3, ip);
779 plogx_dbg("\tUpdating MAC entry for IP "IPv4_BYTES_FMT" with MAC "MAC_BYTES_FMT"\n",
780 IP4(ip), MAC_BYTES(hdr_arp->arp.data.sha.addr_bytes));
784 struct arp_table *entry;
// rte_hash_add_key is used to obtain the slot for ip (adding the key if it is not present yet).
785 ret = rte_hash_add_key(l3->ip_hash, (const void *)&ip);
787 plogx_info("Unable add ip "IPv4_BYTES_FMT" in mac_hash\n", IP4(ip));
// The entry refers to a next hop: the MAC and deadlines are stored in next_hops[nh] instead.
788 } else if ((nh = l3->arp_table[ret].nh) != MAX_HOP_INDEX) {
789 entry = &l3->next_hops[nh];
790 memcpy(&entry->mac, &(hdr_arp->arp.data.sha), sizeof(prox_rte_ether_addr));
791 entry->reachable_timeout = tsc + reachable_timeout;
792 update_arp_ndp_retransmit_timeout(l3, &entry->arp_ndp_retransmit_timeout, l3->arp_ndp_retransmit_timeout);
794 memcpy(&l3->arp_table[ret].mac, &(hdr_arp->arp.data.sha), sizeof(prox_rte_ether_addr));
795 l3->arp_table[ret].reachable_timeout = tsc + reachable_timeout;
796 update_arp_ndp_retransmit_timeout(l3, &l3->arp_table[ret].arp_ndp_retransmit_timeout, l3->arp_ndp_retransmit_timeout);
799 else if (ip == l3->gw.ip) {
800 // MAC address of the gateway
801 memcpy(&l3->gw.mac, &hdr_arp->arp.data.sha, 6);
// This flag on l3->flags is what the gateway branch of write_dst_mac tests.
802 l3->flags |= FLAG_DST_MAC_KNOWN;
803 l3->gw.reachable_timeout = tsc + reachable_timeout;
804 update_arp_ndp_retransmit_timeout(l3, &l3->gw.arp_ndp_retransmit_timeout, l3->arp_ndp_retransmit_timeout);
805 } else if (l3->n_pkts < 4) {
806 // Few packets tracked - should be faster to loop through them thean using a hash table
807 for (idx = 0; idx < l3->n_pkts; idx++) {
808 ip_dst = l3->optimized_arp_table[idx].ip;
// idx < n_pkts after the loop means a matching entry was found.
812 if (idx < l3->n_pkts) {
813 memcpy(&l3->optimized_arp_table[idx].mac, &(hdr_arp->arp.data.sha), sizeof(prox_rte_ether_addr));
814 l3->optimized_arp_table[idx].reachable_timeout = tsc + reachable_timeout;
815 update_arp_ndp_retransmit_timeout(l3, &l3->optimized_arp_table[idx].arp_ndp_retransmit_timeout, l3->arp_ndp_retransmit_timeout);
818 ret = rte_hash_add_key(l3->ip_hash, (const void *)&ip);
820 plogx_info("Unable add ip "IPv4_BYTES_FMT" in mac_hash\n", IP4(ip));
822 memcpy(&l3->arp_table[ret].mac, &(hdr_arp->arp.data.sha), sizeof(prox_rte_ether_addr));
823 l3->arp_table[ret].reachable_timeout = tsc + reachable_timeout;
824 update_arp_ndp_retransmit_timeout(l3, &l3->arp_table[ret].arp_ndp_retransmit_timeout, l3->arp_ndp_retransmit_timeout);
829 case MAC_INFO_FROM_MASTER_FOR_IPV6:
830 ip6 = ctrl_ring_get_ipv6_addr(mbufs[j]);
// The MAC arrives packed in a 64-bit value; its first sizeof(prox_rte_ether_addr) bytes in memory are copied below.
831 uint64_t data = ctrl_ring_get_data(mbufs[j]);
833 if (l3->n_pkts < 4) {
834 // Few packets tracked - should be faster to loop through them thean using a hash table
835 for (idx = 0; idx < l3->n_pkts; idx++) {
836 ip6_dst = &l3->optimized_arp_table[idx].ip6;
837 if (memcmp(ip6_dst, ip6, sizeof(struct ipv6_addr)) == 0)
840 if (idx < l3->n_pkts) {
841 // IP found; this is a reply for one of our requests!
842 memcpy(&l3->optimized_arp_table[idx].mac, &data, sizeof(prox_rte_ether_addr));
// Only the reachable deadline is refreshed in the visible IPv6 lines.
843 l3->optimized_arp_table[idx].reachable_timeout = tsc + l3->reachable_timeout * hz / 1000;
846 int ret = rte_hash_add_key(l3->ip6_hash, (const void *)ip6);
848 plogx_info("Unable add ip "IPv6_BYTES_FMT" in mac_hash\n", IPv6_BYTES(ip6->bytes));
850 memcpy(&l3->arp_table[ret].mac, &data, sizeof(prox_rte_ether_addr));
851 l3->arp_table[ret].reachable_timeout = tsc + l3->reachable_timeout * hz / 1000;
856 case SEND_NDP_FROM_MASTER:
857 case SEND_ARP_REQUEST_FROM_MASTER:
858 case SEND_ARP_REPLY_FROM_MASTER:
860 // tx_ctrlplane_pkt does not drop packets
861 plogx_dbg("\tForwarding (ARP) packet from master\n");
862 tbase->aux->tx_ctrlplane_pkt(tbase, &mbufs[j], 1, out);
863 TASK_STATS_ADD_TX_NON_DP(&tbase->aux->stats, 1);
865 case SEND_ICMP_FROM_MASTER:
867 // tx_ctrlplane_pkt does not drop packets
868 plogx_dbg("\tForwarding (PING) packet from master\n");
869 tbase->aux->tx_ctrlplane_pkt(tbase, &mbufs[j], 1, out);
870 TASK_STATS_ADD_TX_NON_DP(&tbase->aux->stats, 1);
873 // Drop Pseudo packets sent to generate ARP requests
874 // There are other IPv4 packets sent from TAP which we cannot delete e.g. BGP packets
876 hdr = rte_pktmbuf_mtod(mbufs[j], prox_rte_ether_hdr *);
// Locate the IPv4 header directly after the Ethernet header, or after a single VLAN tag.
877 if (hdr->ether_type == ETYPE_IPv4) {
878 pip = (prox_rte_ipv4_hdr *)(hdr + 1);
879 } else if (hdr->ether_type == ETYPE_VLAN) {
880 prox_rte_vlan_hdr *vlan = (prox_rte_vlan_hdr *)(hdr + 1);
// NOTE(review): re-assigns the value `vlan` was just initialised with; redundant.
881 vlan = (prox_rte_vlan_hdr *)(hdr + 1);
882 if (vlan->eth_proto == ETYPE_IPv4) {
883 pip = (prox_rte_ipv4_hdr *)(vlan + 1);
// NOTE(review): this test relies on pip being NULL for non-IPv4 frames; that initialisation is
// not visible in this view - confirm. The UDP header is taken right after a fixed-size IPv4
// header (pip + 1), i.e. IPv4 options are not accounted for.
886 if (pip && (pip->next_proto_id == IPPROTO_UDP)) {
887 udp_hdr = (prox_rte_udp_hdr *)(pip + 1);
// A pseudo packet has both UDP ports equal to PROX_PSEUDO_PKT_PORT and a datagram length of 8 (header only).
888 if ((udp_hdr->dst_port == rte_cpu_to_be_16(PROX_PSEUDO_PKT_PORT)) &&
889 (udp_hdr->src_port == rte_cpu_to_be_16(PROX_PSEUDO_PKT_PORT)) &&
890 (rte_be_to_cpu_16(udp_hdr->dgram_len) == 8)) {
891 plogx_dbg("Dropping PROX packet\n");
897 uint16_t src_port = 0, dst_port = 0, len = 0;
899 src_port = udp_hdr->src_port;
900 dst_port = udp_hdr->dst_port;
901 len = rte_be_to_cpu_16(udp_hdr->dgram_len);
903 plogx_dbg("tForwarding TAP packet from master. Type = %x, pip=%p, udp = %p, udp = {src = %x, dst = %x, len = %d}\n", hdr->ether_type, pip, udp_hdr, src_port, dst_port,len );
905 // tx_ctrlplane_pkt does not drop packets
906 tbase->aux->tx_ctrlplane_pkt(tbase, &mbufs[j], 1, out);
907 TASK_STATS_ADD_TX_NON_DP(&tbase->aux->stats, 1);
909 case IPV6_INFO_FROM_MASTER:
910 // addr = ctrl_ring_get_data(mbufs[j]);
911 ip6 = ctrl_ring_get_ipv6_addr(mbufs[j]);
// No global address yet: adopt the one received.
912 if (memcmp(&l3->global_ipv6 , &null_addr, 16) == 0) {
913 memcpy(&l3->global_ipv6, ip6, sizeof(struct ipv6_addr));
914 plog_info("Core %d task %d received global IP "IPv6_BYTES_FMT"\n", l3->core_id, l3->task_id, IPv6_BYTES(ip6->bytes));
// Otherwise compare only the first 8 bytes with the address already held.
915 } else if (memcmp(&l3->global_ipv6, ip6, 8) == 0) {
// Log the matching prefix once only.
916 if (l3->prefix_printed == 0) {
917 plog_info("Core %d task %d received expected prefix "IPv6_PREFIX_FMT"\n", l3->core_id, l3->task_id, IPv6_PREFIX(ip6->bytes));
918 l3->prefix_printed = 1;
921 plog_warn("Core %d task %d received unexpected prefix "IPv6_PREFIX_FMT", IP = "IPv6_PREFIX_FMT"\n", l3->core_id, l3->task_id, IPv6_PREFIX(ip6->bytes), IPv6_PREFIX(l3->global_ipv6.bytes));
926 plog_err("Unexpected message received: %d\n", command);