2 // Copyright (c) 2010-2020 Intel Corporation
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
8 // http://www.apache.org/licenses/LICENSE-2.0
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
17 #include <rte_lcore.h>
19 #include <rte_hash_crc.h>
22 #include "task_base.h"
27 #include "handle_master.h"
28 #include "prox_port_cfg.h"
29 #include "packet_utils.h"
30 #include "prox_shared.h"
32 #include "hash_entry_types.h"
33 #include "prox_compat.h"
36 #include "prox_ipv6.h"
// Locate the IPv4 destination address of a frame, skipping any VLAN tags.
//   pkt:    start of the Ethernet frame
//   len:    frame length in bytes, used to bound every header access below
//   ip_dst: out, receives the destination address exactly as stored in the IPv4 header
//   vlan:   out, receives the masked, host-order TCI of the last VLAN tag that was walked
// Callers in this file treat a non-zero return value as "no IP address found".
39 static inline int find_ip(struct ether_hdr_arp *pkt, uint16_t len, uint32_t *ip_dst, uint16_t *vlan)
41 prox_rte_vlan_hdr *vlan_hdr;
42 prox_rte_ether_hdr *eth_hdr = (prox_rte_ether_hdr*)pkt;
43 prox_rte_ipv4_hdr *ip;
44 uint16_t ether_type = eth_hdr->ether_type;
45 uint16_t l2_len = sizeof(prox_rte_ether_hdr);
// Follow 802.1ad / 802.1Q tags for as long as one more tag header fits strictly inside len.
49 while (((ether_type == ETYPE_8021ad) || (ether_type == ETYPE_VLAN)) && (l2_len + sizeof(prox_rte_vlan_hdr) < len)) {
50 vlan_hdr = (prox_rte_vlan_hdr *)((uint8_t *)pkt + l2_len);
52 ether_type = vlan_hdr->eth_proto;
// The TCI is masked while still in network byte order and only then converted to host order.
// NOTE(review): presumably 0xFF0F selects the 12-bit VLAN id on a little-endian host - confirm.
53 *vlan = rte_be_to_cpu_16(vlan_hdr->vlan_tci & 0xFF0F); // Store VLAN, or CVLAN if QinQ
59 // In case of MPLS, next hop MAC is based on MPLS, not destination IP
71 plog_warn("Unsupported packet type %x - CRC might be wrong\n", ether_type);
// Only read the IPv4 header when l2_len is non-zero and the whole header fits inside len.
75 if (l2_len && (l2_len + sizeof(prox_rte_ipv4_hdr) <= len)) {
// NOTE(review): this declaration shadows the function-scope `ip` declared above.
76 prox_rte_ipv4_hdr *ip = (prox_rte_ipv4_hdr *)((uint8_t *)pkt + l2_len);
77 // TODO: implement LPM => replace ip_dst by next hop IP DST
// No byte-order conversion is applied to the address here.
78 *ip_dst = ip->dst_addr;
// Walk the VLAN tags of a frame and report the VLAN of the last tag seen.
//   pkt:  start of the Ethernet frame
//   len:  frame length in bytes, used to bound the tag walk
//   vlan: out, receives the masked, host-order TCI of the last VLAN tag that was walked
84 static inline void find_vlan(struct ether_hdr_arp *pkt, uint16_t len, uint16_t *vlan)
86 prox_rte_vlan_hdr *vlan_hdr;
87 prox_rte_ether_hdr *eth_hdr = (prox_rte_ether_hdr*)pkt;
88 uint16_t ether_type = eth_hdr->ether_type;
89 uint16_t l2_len = sizeof(prox_rte_ether_hdr);
// Follow 802.1ad / 802.1Q tags for as long as one more tag header fits strictly inside len.
93 while (((ether_type == ETYPE_8021ad) || (ether_type == ETYPE_VLAN)) && (l2_len + sizeof(prox_rte_vlan_hdr) < len)) {
94 vlan_hdr = (prox_rte_vlan_hdr *)((uint8_t *)pkt + l2_len);
96 ether_type = vlan_hdr->eth_proto;
// The TCI is masked while still in network byte order and only then converted to host order.
97 *vlan = rte_be_to_cpu_16(vlan_hdr->vlan_tci & 0xFF0F); // Store VLAN, or CVLAN if QinQ
// Locate the IPv6 header of a frame, skipping any VLAN tags.
//   pkt:    start of the Ethernet frame
//   len:    frame length in bytes, used to bound every header access below
//   ip_dst: out, receives a copy of the IPv6 destination address
//   vlan:   out, receives the masked, host-order TCI of the last VLAN tag that was walked
// Returns a pointer to the source address inside the packet itself (not a copy), so the
// caller can overwrite it in place. The caller in this file treats NULL as "no IP address found".
101 static inline struct ipv6_addr *find_ip6(prox_rte_ether_hdr *pkt, uint16_t len, struct ipv6_addr *ip_dst, uint16_t *vlan)
103 prox_rte_vlan_hdr *vlan_hdr;
104 prox_rte_ipv6_hdr *ip;
105 uint16_t ether_type = pkt->ether_type;
106 uint16_t l2_len = sizeof(prox_rte_ether_hdr);
// Follow 802.1ad / 802.1Q tags for as long as one more tag header fits strictly inside len.
110 while (((ether_type == ETYPE_8021ad) || (ether_type == ETYPE_VLAN)) && (l2_len + sizeof(prox_rte_vlan_hdr) < len)) {
111 vlan_hdr = (prox_rte_vlan_hdr *)((uint8_t *)pkt + l2_len);
113 ether_type = vlan_hdr->eth_proto;
// The TCI is masked while still in network byte order and only then converted to host order.
114 *vlan = rte_be_to_cpu_16(vlan_hdr->vlan_tci & 0xFF0F); // Store VLAN, or CVLAN if QinQ
117 switch (ether_type) {
120 // In case of MPLS, next hop MAC is based on MPLS, not destination IP
132 plog_warn("Unsupported packet type %x - CRC might be wrong\n", ether_type);
// Only read the IPv6 header when l2_len is non-zero and the whole header fits inside len.
136 if (l2_len && (l2_len + sizeof(prox_rte_ipv6_hdr) <= len)) {
// NOTE(review): this declaration shadows the function-scope `ip` declared above.
137 prox_rte_ipv6_hdr *ip = (prox_rte_ipv6_hdr *)((uint8_t *)pkt + l2_len);
138 // TODO: implement LPM => replace ip_dst by next hop IP DST
139 memcpy(ip_dst, &ip->dst_addr, sizeof(struct ipv6_addr));
140 return (struct ipv6_addr *)&ip->src_addr;
// Send an unsolicited IPv6 Neighbour Advertisement for each configured address of the task.
// One advertisement is built for the local address and one for the global address,
// each only when that address differs from null_addr. Each packet uses an mbuf taken
// from the task's ARP/ND mempool and is transmitted through tx_ctrlplane_pkt.
//   tbase: task whose l3 state (addresses, mempool, reachable port) is used
145 void send_unsollicited_neighbour_advertisement(struct task_base *tbase)
148 uint8_t out = 0, port_id = tbase->l3.reachable_port_id;
149 struct rte_mbuf *mbuf = NULL;
// Advertise the local address, if one is set.
151 if (memcmp(&tbase->l3.local_ipv6, &null_addr, 16) != 0) {
152 ret = rte_mempool_get(tbase->l3.arp_nd_pool, (void **)&mbuf);
153 if (likely(ret == 0)) {
154 mbuf->port = port_id;
155 build_neighbour_advertisement(tbase->l3.tmaster, mbuf, &prox_port_cfg[port_id].eth_addr, &tbase->l3.local_ipv6, PROX_UNSOLLICITED);
156 tbase->aux->tx_ctrlplane_pkt(tbase, &mbuf, 1, &out);
// Counted as a non-dataplane transmission.
157 TASK_STATS_ADD_TX_NON_DP(&tbase->aux->stats, 1);
159 plog_err("Failed to get a mbuf from arp/ndp mempool\n");
// Advertise the global address, if one is set.
163 if (memcmp(&tbase->l3.global_ipv6, &null_addr, 16) != 0) {
164 ret = rte_mempool_get(tbase->l3.arp_nd_pool, (void **)&mbuf);
165 if (likely(ret == 0)) {
166 mbuf->port = port_id;
167 build_neighbour_advertisement(tbase->l3.tmaster, mbuf, &prox_port_cfg[port_id].eth_addr, &tbase->l3.global_ipv6, PROX_UNSOLLICITED);
168 tbase->aux->tx_ctrlplane_pkt(tbase, &mbuf, 1, &out);
169 TASK_STATS_ADD_TX_NON_DP(&tbase->aux->stats, 1);
171 plog_err("Failed to get a mbuf from arp/ndp mempool\n");
// NOTE(review): presumably reached only when neither address is configured - confirm the guarding condition.
176 plog_err("No neighbor advertisement sent as no local or global ipv6\n");
// Build and transmit one IPv6 Router Solicitation on the task's reachable port.
//   tbase: task providing the ARP/ND mempool, reachable port and transmit function
//   targ:  task arguments; targ->local_ipv6 is passed to build_router_sollicitation
180 static void send_router_sollicitation(struct task_base *tbase, struct task_args *targ)
183 uint8_t out = 0, port_id = tbase->l3.reachable_port_id;
184 struct rte_mbuf *mbuf;
186 ret = rte_mempool_get(tbase->l3.arp_nd_pool, (void **)&mbuf);
// A zero return from rte_mempool_get means an mbuf was obtained.
187 if (likely(ret == 0)) {
188 mbuf->port = port_id;
189 build_router_sollicitation(mbuf, &prox_port_cfg[port_id].eth_addr, &targ->local_ipv6);
190 tbase->aux->tx_ctrlplane_pkt(tbase, &mbuf, 1, &out);
// Counted as a non-dataplane transmission.
191 TASK_STATS_ADD_TX_NON_DP(&tbase->aux->stats, 1);
193 plog_err("Failed to get a mbuf from arp/ndp mempool\n");
197 /* This implementation could be improved: instead of checking each time we send a packet whether we need also
198 to send an ARP, we should only check whether the MAC is valid.
199 We should check arp_ndp_retransmit_timeout in the master process. This would also require the generating task to clear its arp ring
200 to avoid sending many ARP while starting after a long stop.
201 We could also check for reachable_timeout in the master so that dataplane has only to check whether MAC is available
202 but this would require either thread safety, or the exchange of information between master and generating core.
// Register a new destination IP in the hash table and prepare an ARP request for it.
//   ip_hash: hash table keyed by IPv4 address
//   ip_dst:  address to add (used as the hash key)
//   entries: base of the ARP table; indexed with the position returned by rte_hash_add_key
//   nh:      next hop index stored in the new entry
//   time:    out, set to the address of the new entry's retransmit timeout
//   tsc, hz, arp_ndp_retransmit_timeout: not referenced by the statements shown here
205 static inline int add_key_and_send_arp(struct rte_hash *ip_hash, uint32_t *ip_dst, struct arp_table *entries, uint64_t tsc, uint64_t hz, uint32_t arp_ndp_retransmit_timeout, prox_next_hop_index_type nh, uint64_t **time)
207 int ret = rte_hash_add_key(ip_hash, (const void *)ip_dst);
// A negative value means the key could not be added.
208 if (unlikely(ret < 0)) {
209 // No reason to send ARP, as reply would be anyhow ignored
210 plogx_err("Unable to add ip "IPv4_BYTES_FMT" in mac_hash\n", IP4(*ip_dst));
// ret is the entry position for this key; it indexes `entries` directly.
213 entries[ret].ip = *ip_dst;
214 entries[ret].nh = nh;
215 *time = &entries[ret].arp_ndp_retransmit_timeout;
// Decide, for a known table entry, whether the packet and/or an ARP request must be sent.
//   entry: ARP table entry for the destination (or next hop)
//   mac:   out, destination MAC of the packet; written only when the entry's MAC is still valid
//   tsc:   current timestamp, compared with the entry's two timeouts
//   time:  out, set to the address of the entry's retransmit timeout when an ARP is due
//   hz, arp_ndp_retransmit_timeout: not referenced by the statements shown here
// Returns SEND_MBUF_AND_ARP_ND when the MAC is valid and an ARP refresh is also due.
// The other outcomes are described by the branch comments below.
220 static inline int update_mac_and_send_mbuf(struct arp_table *entry, prox_rte_ether_addr *mac, uint64_t tsc, uint64_t hz, uint32_t arp_ndp_retransmit_timeout, uint64_t **time)
// Common case: neither the retransmit timeout nor the reachable timeout has passed yet.
222 if (likely((tsc < entry->arp_ndp_retransmit_timeout) && (tsc < entry->reachable_timeout))) {
223 memcpy(mac, &entry->mac, sizeof(prox_rte_ether_addr));
225 } else if (tsc > entry->arp_ndp_retransmit_timeout) {
226 // long time since we have sent an arp, send arp
227 *time = &entry->arp_ndp_retransmit_timeout;
228 if (tsc < entry->reachable_timeout){
229 // MAC is valid in the table => send also the mbuf
230 memcpy(mac, &entry->mac, sizeof(prox_rte_ether_addr));
231 return SEND_MBUF_AND_ARP_ND;
233 // MAC still unknown, or timed out => only send ARP
237 // MAC is unknown and we already sent an ARP recently, drop mbuf and wait for ARP reply
// Fill in the destination MAC of an outgoing IPv4 packet and tell the caller whether an ARP request is needed.
//   tbase:  task owning the L3 state (tbase->l3)
//   mbuf:   packet whose Ethernet destination address is written in place
//   ip_dst: out, receives the destination IP found by find_ip()
//   vlan:   out, receives the VLAN found by find_ip() / find_vlan()
//   time:   out, set to the address of the retransmit timeout of the entry an ARP should be sent for
//   tsc:    current timestamp, compared with the per-entry timeouts
// Returns SEND_MBUF_AND_ARP_ND when a still-valid MAC was written and an ARP refresh is also due.
// The remaining outcomes (send the mbuf only, send an ARP only, drop) are described by the branch comments below.
// Three lookup strategies appear below: a routing table (LPM), a single gateway, and a per-destination
// ARP table that starts as a small linear array and moves to a hash table once it grows.
241 int write_dst_mac(struct task_base *tbase, struct rte_mbuf *mbuf, uint32_t *ip_dst, uint16_t *vlan, uint64_t **time, uint64_t tsc)
243 const uint64_t hz = rte_get_tsc_hz();
244 struct ether_hdr_arp *packet = rte_pktmbuf_mtod(mbuf, struct ether_hdr_arp *);
245 prox_rte_ether_addr *mac = &packet->ether_hdr.d_addr;
246 prox_next_hop_index_type next_hop_index;
// Static state used to rate-limit the "No route" message; shared by every caller of this function.
247 static uint64_t last_tsc = 0, n_no_route = 0;
249 struct l3_base *l3 = &(tbase->l3);
251 // First find the next hop
253 // A routing table was configured
254 // If a gw (gateway_ipv4) is also specified, it is used as default gw only i.e. lowest priority (shortest prefix)
255 // This is implemented automatically through lpm
256 uint16_t len = rte_pktmbuf_pkt_len(mbuf);
257 if (find_ip(packet, len, ip_dst, vlan) != 0) {
258 // Unable to find IP address => non IP packet => send it as is
// The LPM is queried with the byte-swapped destination address.
261 if (unlikely(rte_lpm_lookup(l3->ipv4_lpm, rte_bswap32(*ip_dst), &next_hop_index) != 0)) {
262 // Prevent printing too many messages
264 if (tsc > last_tsc + rte_get_tsc_hz()) {
265 plogx_err("No route to IP "IPv4_BYTES_FMT" (%ld times)\n", IP4(*ip_dst), n_no_route);
271 struct arp_table *entry = &l3->next_hops[next_hop_index];
275 return update_mac_and_send_mbuf(entry, mac, tsc, hz, l3->arp_ndp_retransmit_timeout, time);
278 // no next ip: this is a local route
279 // Find IP in lookup table. Send ARP if not found
280 int ret = rte_hash_lookup(l3->ip_hash, (const void *)ip_dst);
281 if (unlikely(ret < 0)) {
282 // IP not found, try to send an ARP
283 return add_key_and_send_arp(l3->ip_hash, ip_dst, l3->arp_table, tsc, hz, l3->arp_ndp_retransmit_timeout, MAX_HOP_INDEX, time);
285 return update_mac_and_send_mbuf(&l3->arp_table[ret], mac, tsc, hz, l3->arp_ndp_retransmit_timeout, time);
289 // No Routing table specified: only a local ip and maybe a gateway
290 // Old default behavior: if a gw is specified, ALL packets go to this gateway (even those we could send w/o the gw
292 uint16_t len = rte_pktmbuf_pkt_len(mbuf);
294 find_vlan(packet, len, vlan);
// Gateway MAC is usable when it is flagged as known and neither timeout has passed.
295 if (likely((l3->flags & FLAG_DST_MAC_KNOWN) && (tsc < l3->gw.arp_ndp_retransmit_timeout) && (tsc < l3->gw.reachable_timeout))) {
296 memcpy(mac, &l3->gw.mac, sizeof(prox_rte_ether_addr));
298 } else if (tsc > l3->gw.arp_ndp_retransmit_timeout) {
299 // long time since we have successfully sent an arp, send arp
300 // If sending ARP failed (ring full) then arp_ndp_retransmit_timeout is not updated to avoid having to wait 1 sec to send ARP REQ again
301 *time = &l3->gw.arp_ndp_retransmit_timeout;
// arp_ndp_retransmit_timeout is divided by 1000 after scaling by hz, i.e. it is handled as milliseconds.
302 l3->gw.arp_ndp_retransmit_timeout = tsc + l3->arp_ndp_retransmit_timeout * hz / 1000;
305 if ((l3->flags & FLAG_DST_MAC_KNOWN) && (tsc < l3->gw.reachable_timeout)){
306 // MAC is valid in the table => send also the mbuf
307 memcpy(mac, &l3->gw.mac, sizeof(prox_rte_ether_addr));
308 return SEND_MBUF_AND_ARP_ND;
310 // MAC still unknown, or timed out => only send ARP
314 // MAC is unknown and we already sent an ARP recently, drop mbuf and wait for ARP reply
319 if (find_ip(packet, len, ip_dst, vlan) != 0) {
320 // Unable to find IP address => non IP packet => send it as is
// With fewer than 4 tracked destinations, a linear scan of optimized_arp_table is used.
323 if (likely(l3->n_pkts < 4)) {
324 for (unsigned int idx = 0; idx < l3->n_pkts; idx++) {
325 if (*ip_dst == l3->optimized_arp_table[idx].ip) {
326 return update_mac_and_send_mbuf(&l3->optimized_arp_table[idx], mac, tsc, hz, l3->arp_ndp_retransmit_timeout, time);
329 // IP address not found in table
330 l3->optimized_arp_table[l3->n_pkts].ip = *ip_dst;
331 *time = &l3->optimized_arp_table[l3->n_pkts].arp_ndp_retransmit_timeout;
334 if (l3->n_pkts < 4) {
338 // We have too many IP addresses to search linearly; lets use hash table instead => copy all entries in hash table
339 for (uint32_t idx = 0; idx < l3->n_pkts; idx++) {
340 uint32_t ip = l3->optimized_arp_table[idx].ip;
341 int ret = rte_hash_add_key(l3->ip_hash, (const void *)&ip);
343 // This should not happen as few entries so far.
344 // If it happens, we still send the ARP as easier:
345 // If the ARP corresponds to this error, the ARP reply will be ignored
346 // If ARP does not correspond to this error/ip, then ARP reply will be handled.
347 plogx_err("Unable add ip "IPv4_BYTES_FMT" in mac_hash (already %d entries)\n", IP4(ip), idx);
349 memcpy(&l3->arp_table[ret], &l3->optimized_arp_table[idx], sizeof(struct arp_table));
354 // Find IP in lookup table. Send ARP if not found
355 int ret = rte_hash_lookup(l3->ip_hash, (const void *)ip_dst);
356 if (unlikely(ret < 0)) {
357 // IP not found, try to send an ARP
// Pass the base of arp_table: add_key_and_send_arp() indexes it with the position returned by
// rte_hash_add_key(). ret is negative on this path, so it must not be used to offset the table.
358 return add_key_and_send_arp(l3->ip_hash, ip_dst, l3->arp_table, tsc, hz, l3->arp_ndp_retransmit_timeout, MAX_HOP_INDEX, time);
361 return update_mac_and_send_mbuf(&l3->arp_table[ret], mac, tsc, hz, l3->arp_ndp_retransmit_timeout, time);
// Fill in the destination MAC (and source IPv6 address) of an outgoing IPv6 packet and tell the
// caller whether a Neighbour Discovery request is needed.
//   tbase:  task owning the L3 state (tbase->l3)
//   mbuf:   packet modified in place (Ethernet destination address, IPv6 source address)
//   ip_dst: out, receives the packet's destination address, or the gateway address when the gateway is used
//   vlan:   out, receives the VLAN found by find_ip6()
// Returns SEND_MBUF_AND_ARP_ND when a still-valid MAC was written and an NDP refresh is also due.
// The remaining outcomes are described by the branch comments below.
368 int write_ip6_dst_mac(struct task_base *tbase, struct rte_mbuf *mbuf, struct ipv6_addr *ip_dst, uint16_t *vlan)
370 const uint64_t hz = rte_get_tsc_hz();
371 prox_rte_ether_hdr *packet = rte_pktmbuf_mtod(mbuf, prox_rte_ether_hdr *);
372 prox_rte_ether_addr *mac = &packet->d_addr;
373 struct ipv6_addr *used_ip_src;
375 uint64_t tsc = rte_rdtsc();
376 uint16_t len = rte_pktmbuf_pkt_len(mbuf);
// pkt_src_ip6 points at the source address inside the packet, so it can be overwritten below.
378 struct ipv6_addr *pkt_src_ip6;
379 if ((pkt_src_ip6 = find_ip6(packet, len, ip_dst, vlan)) == NULL) {
380 // Unable to find IP address => non IP packet => send it as is
383 struct l3_base *l3 = &(tbase->l3);
385 // Configure source IP
// Prefix matching compares only the first 8 bytes of the addresses.
386 if (memcmp(&l3->local_ipv6, ip_dst, 8) == 0) {
387 // Same prefix as local -> use local
388 used_ip_src = &l3->local_ipv6;
389 } else if (memcmp(&l3->global_ipv6 , ip_dst, 8) == 0) {
390 // Same prefix as global -> use global
391 used_ip_src = &l3->global_ipv6;
// A gateway is configured: use the global address as source and resolve the gateway instead of the destination.
392 } else if (memcmp(&l3->gw.ip6 , &null_addr, sizeof(struct ipv6_addr)) != 0) {
393 used_ip_src = &l3->global_ipv6;
394 memcpy(ip_dst, &l3->gw.ip6, sizeof(struct ipv6_addr));
395 } else if (memcmp(&l3->global_ipv6 , &null_addr, sizeof(struct ipv6_addr)) != 0) {
396 // Global IP is defined -> use it
397 used_ip_src = &l3->global_ipv6;
// NOTE(review): the message reports an error but is logged at info level.
399 plog_info("Error as trying to send a packet to "IPv6_BYTES_FMT" using "IPv6_BYTES_FMT" (local)\n", IPv6_BYTES(ip_dst->bytes), IPv6_BYTES(l3->local_ipv6.bytes));
// Overwrite the packet's source address with the one selected above.
402 memcpy(pkt_src_ip6, used_ip_src, sizeof(struct ipv6_addr));
// With fewer than 4 tracked destinations, a linear scan of optimized_arp_table is used.
405 if (likely(l3->n_pkts < 4)) {
406 for (unsigned int idx = 0; idx < l3->n_pkts; idx++) {
407 if (memcmp(ip_dst, &l3->optimized_arp_table[idx].ip6, sizeof(struct ipv6_addr)) == 0) {
408 // IP address already in table
409 if ((tsc < l3->optimized_arp_table[idx].arp_ndp_retransmit_timeout) && (tsc < l3->optimized_arp_table[idx].reachable_timeout)) {
410 // MAC address was recently updated in table, use it
411 // plog_dbg("Valid MAC address found => send packet\n");
412 memcpy(mac, &l3->optimized_arp_table[idx].mac, sizeof(prox_rte_ether_addr));
414 } else if (tsc > l3->optimized_arp_table[idx].arp_ndp_retransmit_timeout) {
415 // NDP not sent since a long time, send NDP
// arp_ndp_retransmit_timeout is divided by 1000 after scaling by hz, i.e. it is handled as milliseconds.
416 l3->optimized_arp_table[idx].arp_ndp_retransmit_timeout = tsc + l3->arp_ndp_retransmit_timeout * hz / 1000;
417 if (tsc < l3->optimized_arp_table[idx].reachable_timeout) {
418 // MAC still valid => also send mbuf
419 plog_dbg("Valid MAC found but NDP retransmit timeout => send packet and NDP\n");
420 memcpy(mac, &l3->optimized_arp_table[idx].mac, sizeof(prox_rte_ether_addr));
421 return SEND_MBUF_AND_ARP_ND;
423 plog_dbg("Unknown MAC => send NDP but cannot send packet\n");
424 // MAC invalid => only send NDP
428 // NDP timeout elapsed, MAC not valid anymore but waiting for NDP reply
429 // plog_dbg("NDP reachable timeout elapsed - waiting for NDP reply\n");
434 // IP address not found in table
435 memcpy(&l3->optimized_arp_table[l3->n_pkts].ip6, ip_dst, sizeof(struct ipv6_addr));
436 l3->optimized_arp_table[l3->n_pkts].arp_ndp_retransmit_timeout = tsc + l3->arp_ndp_retransmit_timeout * hz / 1000;
439 if (l3->n_pkts < 4) {
443 // We have too many IP addresses to search linearly; lets use hash table instead => copy all entries in hash table
444 for (uint32_t idx = 0; idx < l3->n_pkts; idx++) {
445 struct ipv6_addr *ip6 = &l3->optimized_arp_table[idx].ip6;
446 int ret = rte_hash_add_key(l3->ip6_hash, (const void *)ip6);
448 // This should not happen as few entries so far.
449 // If it happens, we still send the NDP as easier:
450 // If the NDP corresponds to this error, the NDP reply will be ignored
451 // If NDP does not correspond to this error/ip, then NDP reply will be handled.
452 plogx_err("Unable add ip "IPv6_BYTES_FMT" in mac_hash (already %d entries)\n", IPv6_BYTES(ip6->bytes), idx);
454 memcpy(&l3->arp_table[ret], &l3->optimized_arp_table[idx], sizeof(struct arp_table));
459 // Find IP in lookup table. Send ND if not found
460 int ret = rte_hash_lookup(l3->ip6_hash, (const void *)ip_dst);
461 if (unlikely(ret < 0)) {
462 // IP not found, try to send an ND
// NOTE(review): this inner `ret` shadows the lookup result declared just above.
463 int ret = rte_hash_add_key(l3->ip6_hash, (const void *)ip_dst);
465 // No reason to send NDP, as reply would be anyhow ignored
466 plogx_err("Unable to add ip "IPv6_BYTES_FMT" in mac_hash\n", IPv6_BYTES(ip_dst->bytes));
469 memcpy(&l3->arp_table[ret].ip6, ip_dst, sizeof(struct ipv6_addr));
470 l3->arp_table[ret].arp_ndp_retransmit_timeout = tsc + l3->arp_ndp_retransmit_timeout * hz / 1000;
475 if (likely((tsc < l3->arp_table[ret].arp_ndp_retransmit_timeout) && (tsc < l3->arp_table[ret].reachable_timeout))) {
476 // MAC still valid and NDP sent recently
477 memcpy(mac, &l3->arp_table[ret].mac, sizeof(prox_rte_ether_addr));
479 } else if (tsc > l3->arp_table[ret].arp_ndp_retransmit_timeout) {
480 // NDP not sent since a long time, send NDP
481 l3->arp_table[ret].arp_ndp_retransmit_timeout = tsc + l3->arp_ndp_retransmit_timeout * hz / 1000;
482 if (tsc < l3->arp_table[ret].reachable_timeout) {
483 // MAC still valid => send also MBUF
484 memcpy(mac, &l3->arp_table[ret].mac, sizeof(prox_rte_ether_addr));
485 return SEND_MBUF_AND_ARP_ND;
// Initialise the per-task L3 state: create the IPv4 and/or IPv6 address hash tables,
// allocate the shared ARP/NDP table, install the control-plane handler, and copy the
// gateway addresses and timeouts from the task arguments.
//   tbase: task whose l3 state is initialised
//   targ:  task arguments (flags, gateway addresses, timeouts, core/task ids)
// Stops via PROX_PANIC when a hash table or the ARP/NDP table cannot be created.
498 void task_init_l3(struct task_base *tbase, struct task_args *targ)
500 static char hash_name[30];
// The same entry count is used for both hash tables and for the ARP/NDP table.
501 uint32_t n_entries = MAX_ARP_ENTRIES * 4;
502 const int socket_id = rte_lcore_to_socket_id(targ->lconf->id);
// Bounded formatting so an unexpectedly large core/task id cannot overrun hash_name.
503 snprintf(hash_name, sizeof(hash_name), "A%03d_%03d_mac_table", targ->lconf->id, targ->id);
507 struct rte_hash_parameters hash_params = {
509 .entries = n_entries,
510 .key_len = sizeof(uint32_t),
511 .hash_func = rte_hash_crc,
512 .hash_func_init_val = 0,
514 if (targ->flags & TASK_ARG_L3) {
515 plog_info("\t\tInitializing L3 (IPv4)\n");
516 tbase->l3.ip_hash = rte_hash_create(&hash_params);
517 PROX_PANIC(tbase->l3.ip_hash == NULL, "Failed to set up ip hash table\n");
521 if (targ->flags & TASK_ARG_NDP) {
522 plog_info("\t\tInitializing NDP (IPv6)\n");
// The IPv6 table reuses hash_params with the key length switched to a full IPv6 address.
523 hash_params.key_len = sizeof(struct ipv6_addr);
524 tbase->l3.ip6_hash = rte_hash_create(&hash_params);
525 PROX_PANIC(tbase->l3.ip6_hash == NULL, "Failed to set up ip hash table\n");
527 tbase->l3.arp_table = (struct arp_table *)prox_zmalloc(n_entries * sizeof(struct arp_table), socket_id);
528 PROX_PANIC(tbase->l3.arp_table == NULL, "Failed to allocate memory for %u entries in arp/ndp table\n", n_entries);
// Report the size of one table entry (struct arp_table), matching the allocation above.
529 plog_info("\t\tarp/ndp table, with %u entries of size %zu\n", n_entries, sizeof(struct arp_table));
531 targ->lconf->ctrl_func_p[targ->task] = handle_ctrl_plane_pkts;
532 targ->lconf->ctrl_timeout = freq_to_tsc(targ->ctrl_freq);
533 tbase->l3.gw.ip = rte_cpu_to_be_32(targ->gateway_ipv4);
534 memcpy(&tbase->l3.gw.ip6, &targ->gateway_ipv6, sizeof(struct ipv6_addr));
535 tbase->flags |= TASK_L3;
536 tbase->l3.core_id = targ->lconf->id;
537 tbase->l3.task_id = targ->id;
538 tbase->l3.tmaster = targ->tmaster;
// The seed is taken from the current timestamp counter.
539 tbase->l3.seed = (uint)rte_rdtsc();
// A zero timeout in the task arguments selects the built-in default.
540 if (targ->reachable_timeout != 0)
541 tbase->l3.reachable_timeout = targ->reachable_timeout;
543 tbase->l3.reachable_timeout = DEFAULT_ARP_TIMEOUT;
544 if (targ->arp_ndp_retransmit_timeout != 0)
545 tbase->l3.arp_ndp_retransmit_timeout = targ->arp_ndp_retransmit_timeout;
547 tbase->l3.arp_ndp_retransmit_timeout = DEFAULT_ARP_UPDATE_TIME;
// Start-time L3 setup for a task: resolve the local IPv4/IPv6 addresses, register them with the
// control plane, optionally load the routing table into an LPM, create the ARP/ND mbuf pool and
// send the initial NDP messages.
//   tbase: task whose l3 state is completed
//   targ:  task arguments; local_ipv4 / local_prefix / local_ipv6 may be filled in here
// The setup runs only when a reachable port exists and the ARP/ND pool has not been created yet.
550 void task_start_l3(struct task_base *tbase, struct task_args *targ)
552 const int socket_id = rte_lcore_to_socket_id(targ->lconf->id);
553 const int NB_ARP_ND_MBUF = 1024;
554 const int ARP_ND_MBUF_SIZE = 2048;
555 const int NB_CACHE_ARP_ND_MBUF = 256;
557 struct prox_port_cfg *port = find_reachable_port(targ);
558 if (port && (tbase->l3.arp_nd_pool == NULL)) {
559 static char name[] = "arp0_pool";
// The port id is the index of the port within prox_port_cfg.
560 tbase->l3.reachable_port_id = port - prox_port_cfg;
// When both the core section and the port section define an address or prefix, they must agree.
561 if ((targ->local_ipv4 && port->ip) && (targ->local_ipv4 != port->ip)) {
562 PROX_PANIC(1, "local_ipv4 in core section ("IPv4_BYTES_FMT") differs from port section ("IPv4_BYTES_FMT")\n", IP4(rte_be_to_cpu_32(targ->local_ipv4)), IP4(rte_be_to_cpu_32(port->ip)));
564 if ((targ->local_ipv4 && port->ip) && (targ->local_prefix != port->prefix)) {
565 PROX_PANIC(1, "local_ipv4 prefix in core section (%d) differs from port section (%d)\n", targ->local_prefix, port->prefix);
// No address in the core section: inherit address and prefix from the port.
567 if (!targ->local_ipv4) {
568 targ->local_ipv4 = port->ip;
569 targ->local_prefix = port->prefix;
570 plog_info("Setting core local_ipv4 from port %d local_ipv4 to "IPv4_BYTES_FMT"\n", tbase->l3.reachable_port_id, IP4(rte_be_to_cpu_32(port->ip)));
572 if (targ->local_ipv4) {
573 tbase->l3.local_ipv4 = rte_be_to_cpu_32(targ->local_ipv4);
574 register_ip_to_ctrl_plane(tbase->l3.tmaster, tbase->l3.local_ipv4, tbase->l3.reachable_port_id, targ->lconf->id, targ->id);
// A non-empty route_table name enables routing through an LPM.
576 if (strcmp(targ->route_table, "") != 0) {
580 PROX_PANIC(tbase->l3.local_ipv4 == 0, "missing local_ipv4 while route table is specified in L3 mode\n");
582 // LPM might be modified runtime => do not share with other cores
583 ret = lua_to_lpm4(prox_lua(), GLOBAL, targ->route_table, socket_id, &lpm);
584 PROX_PANIC(ret, "Failed to load IPv4 LPM:\n%s\n", get_lua_to_errors());
586 tbase->l3.ipv4_lpm = lpm->rte_lpm;
587 tbase->l3.next_hops = prox_zmalloc(sizeof(*tbase->l3.next_hops) * MAX_HOP_INDEX, socket_id);
588 PROX_PANIC(tbase->l3.next_hops == NULL, "Could not allocate memory for next hop\n");
// Copy the next-hop addresses loaded from the routing table, byte-swapped.
590 for (uint32_t i = 0; i < MAX_HOP_INDEX; i++) {
591 if (!lpm->next_hops[i].ip_dst)
594 tbase->l3.next_hops[i].ip = rte_bswap32(lpm->next_hops[i].ip_dst);
595 int tx_port = lpm->next_hops[i].mac_port.out_idx;
596 // gen only supports one port right now .... hence port = 0
597 if ((tx_port > targ->nb_txports - 1) && (tx_port > targ->nb_txrings - 1)) {
598 PROX_PANIC(1, "Routing Table contains port %d but only %d tx port/ %d ring:\n", tx_port, targ->nb_txports, targ->nb_txrings);
601 plog_info("Using routing table %s in l3 mode, with %d gateways\n", targ->route_table, tbase->l3.nb_gws);
603 // Last but one "next_hop_index" is not a gateway but direct routes
// An ip of 0 marks this next hop as a direct (local) route.
604 tbase->l3.next_hops[tbase->l3.nb_gws].ip = 0;
605 ret = rte_lpm_add(tbase->l3.ipv4_lpm, targ->local_ipv4, targ->local_prefix, tbase->l3.nb_gws++);
606 PROX_PANIC(ret, "Failed to add local_ipv4 "IPv4_BYTES_FMT"/%d to lpm\n", IP4(tbase->l3.local_ipv4), targ->local_prefix);
607 // Last "next_hop_index" is default gw
608 tbase->l3.next_hops[tbase->l3.nb_gws].ip = rte_bswap32(targ->gateway_ipv4);
// The default gateway is installed with prefix length 0.
609 if (targ->gateway_ipv4) {
610 ret = rte_lpm_add(tbase->l3.ipv4_lpm, targ->gateway_ipv4, 0, tbase->l3.nb_gws++);
611 PROX_PANIC(ret, "Failed to add gateway_ipv4 "IPv4_BYTES_FMT"/%d to lpm\n", IP4(tbase->l3.gw.ip), 0);
615 master_init_vdev(tbase->l3.tmaster, tbase->l3.reachable_port_id, targ->lconf->id, targ->id);
617 // Create IPv6 addr if none were configured
618 if (targ->flags & TASK_ARG_NDP) {
// No local address configured: derive one with set_link_local() and set_EUI() from the port's Ethernet address.
619 if (!memcmp(&targ->local_ipv6, &null_addr, sizeof(struct ipv6_addr))) {
620 set_link_local(&targ->local_ipv6);
621 set_EUI(&targ->local_ipv6, &port->eth_addr);
623 plog_info("\tCore %d, task %d, local IPv6 addr is "IPv6_BYTES_FMT" (%s)\n",
624 targ->lconf->id, targ->id,
625 IPv6_BYTES(targ->local_ipv6.bytes),
626 IP6_Canonical(&targ->local_ipv6));
627 memcpy(&tbase->l3.local_ipv6, &targ->local_ipv6, sizeof(struct ipv6_addr));
// The global address is copied only when one is configured.
629 if (memcmp(&targ->global_ipv6, &null_addr, sizeof(struct ipv6_addr))) {
630 memcpy(&tbase->l3.global_ipv6, &targ->global_ipv6, sizeof(struct ipv6_addr));
631 plog_info("\tCore %d, task %d, global IPv6 addr is "IPv6_BYTES_FMT" (%s)\n",
632 targ->lconf->id, targ->id,
633 IPv6_BYTES(targ->global_ipv6.bytes),
634 IP6_Canonical(&targ->global_ipv6));
636 if (targ->ipv6_router)
637 register_router_to_ctrl_plane(tbase->l3.tmaster, tbase->l3.reachable_port_id, targ->lconf->id, targ->id, &targ->local_ipv6, &targ->global_ipv6, &targ->router_prefix);
639 register_node_to_ctrl_plane(tbase->l3.tmaster, &targ->local_ipv6, &targ->global_ipv6, tbase->l3.reachable_port_id, targ->lconf->id, targ->id);
// Dedicated mbuf pool for the ARP / ND packets generated by this task.
643 struct rte_mempool *ret = rte_mempool_create(name, NB_ARP_ND_MBUF, ARP_ND_MBUF_SIZE, NB_CACHE_ARP_ND_MBUF,
644 sizeof(struct rte_pktmbuf_pool_private), rte_pktmbuf_pool_init, NULL, rte_pktmbuf_init, 0,
646 PROX_PANIC(ret == NULL, "Failed to allocate ARP/ND memory pool on socket %u with %u elements\n",
647 rte_socket_id(), NB_ARP_ND_MBUF);
648 plog_info("\tMempool %p (%s) size = %u * %u cache %u, socket %d (for ARP/ND)\n", ret, name, NB_ARP_ND_MBUF,
649 ARP_ND_MBUF_SIZE, NB_CACHE_ARP_ND_MBUF, rte_socket_id());
650 tbase->l3.arp_nd_pool = ret;
// Non-router NDP tasks send a Router Solicitation at start.
651 if ((targ->flags & TASK_ARG_NDP) && (!targ->ipv6_router)) {
652 plog_info("Sending Router Sollicitation\n");
653 send_router_sollicitation(tbase, targ);
655 if ((targ->flags & TASK_ARG_NDP) && (targ->flags & TASK_ARG_SEND_NA_AT_STARTUP)) {
656 plog_info("Sending unsollicited Neighbour Advertisement\n");
657 send_unsollicited_neighbour_advertisement(tbase);
// Replace the IPv4 gateway of a task and invalidate the MAC learnt for the previous gateway.
//   tbase: task whose gateway is changed
//   ip:    new gateway address, stored as given (no byte-order conversion here)
663 void task_set_gateway_ip(struct task_base *tbase, uint32_t ip)
665 tbase->l3.gw.ip = ip;
// FLAG_DST_MAC_KNOWN is set and tested on tbase->l3.flags elsewhere in this file, so it has
// to be cleared there for the stale gateway MAC to stop being used.
666 tbase->l3.flags &= ~FLAG_DST_MAC_KNOWN;
// Set the local IPv4 address of a task.
//   tbase: task whose l3.local_ipv4 is overwritten
//   ip:    new address, stored as given (no byte-order conversion here)
669 void task_set_local_ip(struct task_base *tbase, uint32_t ip)
671 tbase->l3.local_ipv4 = ip;
// Zero the ARP retransmit timeout of the entry that holds `ip`, so that the next timeout
// comparison against the current tsc sees it as expired.
// The entry is searched in the hash-backed ARP table, the gateway, and the small linear table,
// matching the lookup strategies used when sending.
//   l3: L3 state holding the tables
//   ip: IPv4 address whose MAC entry is reported as timed out
674 static void reset_arp_ndp_retransmit_timeout(struct l3_base *l3, uint32_t ip)
677 plogx_dbg("MAC entry for IP "IPv4_BYTES_FMT" timeout in kernel\n", IP4(ip));
680 int ret = rte_hash_lookup(l3->ip_hash, (const void *)&ip);
682 l3->arp_table[ret].arp_ndp_retransmit_timeout = 0;
683 } else if (ip == l3->gw.ip) {
684 l3->gw.arp_ndp_retransmit_timeout = 0;
685 } else if (l3->n_pkts < 4) {
// Linear search; idx ends below n_pkts only when a matching entry was found.
686 for (idx = 0; idx < l3->n_pkts; idx++) {
687 uint32_t ip_dst = l3->optimized_arp_table[idx].ip;
691 if (idx < l3->n_pkts) {
692 l3->optimized_arp_table[idx].arp_ndp_retransmit_timeout = 0;
695 int ret = rte_hash_lookup(l3->ip_hash, (const void *)&ip);
697 l3->arp_table[ret].arp_ndp_retransmit_timeout = 0;
// Return the next-hop index used for gateway `gw_ip`, creating a new next hop when needed.
//   tbase: task owning the next_hops table and the nb_gws counter
//   gw_ip: gateway address, compared with next_hops[i].ip as stored
// Returns MAX_HOP_INDEX when the gateway is unknown and the table already holds MAX_HOP_INDEX entries.
702 static prox_next_hop_index_type get_nh_index(struct task_base *tbase, uint32_t gw_ip)
704 // Check if gateway already exists
705 for (prox_next_hop_index_type i = 0; i < tbase->l3.nb_gws; i++) {
706 if (tbase->l3.next_hops[i].ip == gw_ip) {
// Not found: append a new next hop while there is room.
710 if (tbase->l3.nb_gws < MAX_HOP_INDEX) {
711 tbase->l3.next_hops[tbase->l3.nb_gws].ip = gw_ip;
713 return tbase->l3.nb_gws - 1;
715 return MAX_HOP_INDEX;
// Process a burst of control-plane messages sent by the master task.
// Each mbuf carries a command (read with get_command) that either updates the routing table,
// updates a MAC entry (IPv4 or IPv6), records IPv6 prefix information, or asks this task to
// transmit the packet (ARP/NDP/ICMP/TAP traffic) through tx_ctrlplane_pkt.
//   tbase:  task owning the L3 state being updated
//   mbufs:  array of received control messages
//   n_pkts: number of entries in mbufs
717 void handle_ctrl_plane_pkts(struct task_base *tbase, struct rte_mbuf **mbufs, uint16_t n_pkts)
720 const uint64_t hz = rte_get_tsc_hz();
721 uint32_t ip, ip_dst, idx, gateway_ip, prefix;
722 prox_next_hop_index_type gateway_index;
723 int j, ret, modified_route;
725 struct ipv6_addr *ip6, *ip6_dst;
727 prox_rte_ether_hdr *hdr;
728 struct ether_hdr_arp *hdr_arp;
729 struct l3_base *l3 = &tbase->l3;
730 uint64_t tsc= rte_rdtsc();
// reachable_timeout is divided by 1000 after scaling by hz, i.e. it is handled as milliseconds.
731 uint64_t reachable_timeout = l3->reachable_timeout * hz / 1000;
733 prox_rte_ipv4_hdr *pip;
734 prox_rte_udp_hdr *udp_hdr;
735 uint8_t port = tbase->l3.reachable_port_id;
737 for (j = 0; j < n_pkts; ++j) {
740 for (j = 0; j < n_pkts; ++j) {
741 PREFETCH0(rte_pktmbuf_mtod(mbufs[j], void *));
744 for (j = 0; j < n_pkts; ++j) {
747 out[0] = OUT_HANDLED;
748 command = get_command(mbufs[j]);
749 plogx_dbg("\tReceived %s mbuf %p\n", actions_string[command], mbufs[j]);
// Add (or modify) a route: find or create the gateway's next-hop index, then add the rule to the LPM.
751 case ROUTE_ADD_FROM_MASTER:
752 ip = ctrl_ring_get_ip(mbufs[j]);
753 gateway_ip = ctrl_ring_get_gateway_ip(mbufs[j]);
754 prefix = ctrl_ring_get_prefix(mbufs[j]);
755 gateway_index = get_nh_index(tbase, gateway_ip);
756 if (gateway_index >= MAX_HOP_INDEX) {
757 plog_err("Unable to find or define gateway index - too many\n");
// Remember whether the rule already existed, only to choose the log message below.
760 modified_route = rte_lpm_is_rule_present(tbase->l3.ipv4_lpm, rte_bswap32(ip), prefix, &nh);
761 ret = rte_lpm_add(tbase->l3.ipv4_lpm, rte_bswap32(ip), prefix, gateway_index);
763 plog_err("Failed to add route to "IPv4_BYTES_FMT"/%d using "IPv4_BYTES_FMT"(index = %d)\n", IP4(ip), prefix, IP4(gateway_ip), gateway_index);
764 } else if (modified_route)
765 plogx_dbg("Modified route to "IPv4_BYTES_FMT"/%d using "IPv4_BYTES_FMT"(index = %d) (was using "IPv4_BYTES_FMT"(index = %d)\n", IP4(ip), prefix, IP4(gateway_ip), gateway_index, IP4(tbase->l3.next_hops[nh].ip), nh);
767 plogx_dbg("Added new route to "IPv4_BYTES_FMT"/%d using "IPv4_BYTES_FMT"(index = %d)\n", IP4(ip), prefix, IP4(gateway_ip), gateway_index);
// Delete a route from the LPM.
771 case ROUTE_DEL_FROM_MASTER:
772 ip = ctrl_ring_get_ip(mbufs[j]);
773 prefix = ctrl_ring_get_prefix(mbufs[j]);
775 ret = rte_lpm_is_rule_present(tbase->l3.ipv4_lpm, rte_bswap32(ip), prefix, &nh);
777 ret = rte_lpm_delete(tbase->l3.ipv4_lpm, rte_bswap32(ip), prefix);
// This message follows rte_lpm_delete, so it reports a failed deletion.
779 plog_err("Failed to delete rule\n");
781 plog_info("Deleting route to "IPv4_BYTES_FMT"/%d\n", IP4(ip), prefix);
// IPv4 MAC information: store the MAC for `ip` in whichever table tracks that address.
785 case MAC_INFO_FROM_MASTER:
786 hdr_arp = rte_pktmbuf_mtod(mbufs[j], struct ether_hdr_arp *);
787 ip = get_ip(mbufs[j]);
789 if (prox_rte_is_zero_ether_addr(&hdr_arp->arp.data.sha)) {
790 // MAC timeout or deleted from kernel table => reset update_time
791 // This will cause us to send new ARP request
792 // However, as reachable_timeout not touched, we should continue sending our regular IP packets
793 reset_arp_ndp_retransmit_timeout(l3, ip);
796 plogx_dbg("\tUpdating MAC entry for IP "IPv4_BYTES_FMT" with MAC "MAC_BYTES_FMT"\n",
797 IP4(ip), MAC_BYTES(hdr_arp->arp.data.sha.addr_bytes));
801 struct arp_table *entry;
802 ret = rte_hash_add_key(l3->ip_hash, (const void *)&ip);
804 plogx_info("Unable add ip "IPv4_BYTES_FMT" in mac_hash\n", IP4(ip));
// An entry whose nh differs from MAX_HOP_INDEX belongs to a gateway: update that next hop instead.
805 } else if ((nh = l3->arp_table[ret].nh) != MAX_HOP_INDEX) {
806 entry = &l3->next_hops[nh];
807 memcpy(&entry->mac, &(hdr_arp->arp.data.sha), sizeof(prox_rte_ether_addr));
808 entry->reachable_timeout = tsc + reachable_timeout;
809 update_arp_ndp_retransmit_timeout(l3, &entry->arp_ndp_retransmit_timeout, l3->arp_ndp_retransmit_timeout);
811 memcpy(&l3->arp_table[ret].mac, &(hdr_arp->arp.data.sha), sizeof(prox_rte_ether_addr));
812 l3->arp_table[ret].reachable_timeout = tsc + reachable_timeout;
813 update_arp_ndp_retransmit_timeout(l3, &l3->arp_table[ret].arp_ndp_retransmit_timeout, l3->arp_ndp_retransmit_timeout);
816 else if (ip == l3->gw.ip) {
817 // MAC address of the gateway
818 memcpy(&l3->gw.mac, &hdr_arp->arp.data.sha, 6);
819 l3->flags |= FLAG_DST_MAC_KNOWN;
820 l3->gw.reachable_timeout = tsc + reachable_timeout;
821 update_arp_ndp_retransmit_timeout(l3, &l3->gw.arp_ndp_retransmit_timeout, l3->arp_ndp_retransmit_timeout);
822 } else if (l3->n_pkts < 4) {
823 // Few packets tracked - should be faster to loop through them than using a hash table
824 for (idx = 0; idx < l3->n_pkts; idx++) {
825 ip_dst = l3->optimized_arp_table[idx].ip;
// idx ends below n_pkts only when a matching entry was found.
829 if (idx < l3->n_pkts) {
830 memcpy(&l3->optimized_arp_table[idx].mac, &(hdr_arp->arp.data.sha), sizeof(prox_rte_ether_addr));
831 l3->optimized_arp_table[idx].reachable_timeout = tsc + reachable_timeout;
832 update_arp_ndp_retransmit_timeout(l3, &l3->optimized_arp_table[idx].arp_ndp_retransmit_timeout, l3->arp_ndp_retransmit_timeout);
835 ret = rte_hash_add_key(l3->ip_hash, (const void *)&ip);
837 plogx_info("Unable add ip "IPv4_BYTES_FMT" in mac_hash\n", IP4(ip));
839 memcpy(&l3->arp_table[ret].mac, &(hdr_arp->arp.data.sha), sizeof(prox_rte_ether_addr));
840 l3->arp_table[ret].reachable_timeout = tsc + reachable_timeout;
841 update_arp_ndp_retransmit_timeout(l3, &l3->arp_table[ret].arp_ndp_retransmit_timeout, l3->arp_ndp_retransmit_timeout);
// IPv6 MAC information: the MAC is taken from the first bytes of the 64-bit data word of the message.
846 case MAC_INFO_FROM_MASTER_FOR_IPV6:
847 ip6 = ctrl_ring_get_ipv6_addr(mbufs[j]);
848 uint64_t data = ctrl_ring_get_data(mbufs[j]);
850 if (l3->n_pkts < 4) {
851 // Few packets tracked - should be faster to loop through them than using a hash table
852 for (idx = 0; idx < l3->n_pkts; idx++) {
853 ip6_dst = &l3->optimized_arp_table[idx].ip6;
854 if (memcmp(ip6_dst, ip6, sizeof(struct ipv6_addr)) == 0)
857 if (idx < l3->n_pkts) {
858 // IP found; this is a reply for one of our requests!
859 memcpy(&l3->optimized_arp_table[idx].mac, &data, sizeof(prox_rte_ether_addr));
860 l3->optimized_arp_table[idx].reachable_timeout = tsc + l3->reachable_timeout * hz / 1000;
863 int ret = rte_hash_add_key(l3->ip6_hash, (const void *)ip6);
865 plogx_info("Unable add ip "IPv6_BYTES_FMT" in mac_hash\n", IPv6_BYTES(ip6->bytes));
867 memcpy(&l3->arp_table[ret].mac, &data, sizeof(prox_rte_ether_addr));
868 l3->arp_table[ret].reachable_timeout = tsc + l3->reachable_timeout * hz / 1000;
// Packets built by the master that this task only has to transmit.
873 case SEND_NDP_FROM_MASTER:
874 case SEND_ARP_REQUEST_FROM_MASTER:
875 case SEND_ARP_REPLY_FROM_MASTER:
877 // tx_ctrlplane_pkt does not drop packets
878 plogx_dbg("\tForwarding (ARP) packet from master\n");
879 tbase->aux->tx_ctrlplane_pkt(tbase, &mbufs[j], 1, out);
880 TASK_STATS_ADD_TX_NON_DP(&tbase->aux->stats, 1);
882 case SEND_ICMP_FROM_MASTER:
884 // tx_ctrlplane_pkt does not drop packets
885 plogx_dbg("\tForwarding (PING) packet from master\n");
886 tbase->aux->tx_ctrlplane_pkt(tbase, &mbufs[j], 1, out);
887 TASK_STATS_ADD_TX_NON_DP(&tbase->aux->stats, 1);
890 // Drop Pseudo packets sent to generate ARP requests
891 // There are other IPv4 packets sent from TAP which we cannot delete e.g. BGP packets
893 hdr = rte_pktmbuf_mtod(mbufs[j], prox_rte_ether_hdr *);
// Locate the IPv4 header, either directly after the Ethernet header or after a single VLAN tag.
// NOTE(review): pip is only assigned in these branches - confirm it is reset for every packet before the check below.
894 if (hdr->ether_type == ETYPE_IPv4) {
895 pip = (prox_rte_ipv4_hdr *)(hdr + 1);
896 } else if (hdr->ether_type == ETYPE_VLAN) {
897 prox_rte_vlan_hdr *vlan = (prox_rte_vlan_hdr *)(hdr + 1);
899 if (vlan->eth_proto == ETYPE_IPv4) {
900 pip = (prox_rte_ipv4_hdr *)(vlan + 1);
903 if (pip && (pip->next_proto_id == IPPROTO_UDP)) {
// NOTE(review): the UDP header is assumed to follow a fixed-size IPv4 header (no IP options) - confirm.
904 udp_hdr = (prox_rte_udp_hdr *)(pip + 1);
// A pseudo packet has both UDP ports equal to PROX_PSEUDO_PKT_PORT and a datagram length of 8.
905 if ((udp_hdr->dst_port == rte_cpu_to_be_16(PROX_PSEUDO_PKT_PORT)) &&
906 (udp_hdr->src_port == rte_cpu_to_be_16(PROX_PSEUDO_PKT_PORT)) &&
907 (rte_be_to_cpu_16(udp_hdr->dgram_len) == 8)) {
908 plogx_dbg("Dropping PROX packet\n");
914 uint16_t src_port = 0, dst_port = 0, len = 0;
916 src_port = udp_hdr->src_port;
917 dst_port = udp_hdr->dst_port;
918 len = rte_be_to_cpu_16(udp_hdr->dgram_len);
920 plogx_dbg("\tForwarding TAP packet from master. Type = %x, pip=%p, udp = %p, udp = {src = %x, dst = %x, len = %d}\n", hdr->ether_type, pip, udp_hdr, src_port, dst_port,len );
922 // tx_ctrlplane_pkt does not drop packets
923 tbase->aux->tx_ctrlplane_pkt(tbase, &mbufs[j], 1, out);
924 TASK_STATS_ADD_TX_NON_DP(&tbase->aux->stats, 1);
// IPv6 prefix / address information: adopt the address as global address when none is set yet,
// otherwise only check that the first 8 bytes match the current global address.
926 case IPV6_INFO_FROM_MASTER:
927 // addr = ctrl_ring_get_data(mbufs[j]);
928 ip6 = ctrl_ring_get_ipv6_addr(mbufs[j]);
929 if (memcmp(&l3->global_ipv6 , &null_addr, 16) == 0) {
930 memcpy(&l3->global_ipv6, ip6, sizeof(struct ipv6_addr));
931 plog_info("Core %d task %d received global IP "IPv6_BYTES_FMT"\n", l3->core_id, l3->task_id, IPv6_BYTES(ip6->bytes));
932 } else if (memcmp(&l3->global_ipv6, ip6, 8) == 0) {
// The "expected prefix" message is printed once only.
933 if (l3->prefix_printed == 0) {
934 plog_info("Core %d task %d received expected prefix "IPv6_PREFIX_FMT"\n", l3->core_id, l3->task_id, IPv6_PREFIX(ip6->bytes));
935 l3->prefix_printed = 1;
938 plog_warn("Core %d task %d received unexpected prefix "IPv6_PREFIX_FMT", IP = "IPv6_PREFIX_FMT"\n", l3->core_id, l3->task_id, IPv6_PREFIX(ip6->bytes), IPv6_PREFIX(l3->global_ipv6.bytes));
943 plog_err("Unexpected message received: %d\n", command);