2 // Copyright (c) 2010-2020 Intel Corporation
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
8 // http://www.apache.org/licenses/LICENSE-2.0
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
17 #include <rte_lcore.h>
19 #include <rte_hash_crc.h>
22 #include "task_base.h"
27 #include "handle_master.h"
28 #include "prox_port_cfg.h"
29 #include "packet_utils.h"
30 #include "prox_shared.h"
32 #include "hash_entry_types.h"
33 #include "prox_compat.h"
36 #include "prox_ipv6.h"
// Locate the IPv4 destination address of a frame, skipping any stacked VLAN tags.
//   pkt    : start of the frame; it is re-read here as a plain Ethernet header
//   len    : number of frame bytes that may be inspected
//   ip_dst : out - receives ip->dst_addr exactly as stored in the packet (no byte swap is done here)
//   vlan   : out - receives the TCI of the last VLAN tag walked over (masked, then converted to CPU order)
// Callers in this file treat a non-zero return value as "no IP address found".
// NOTE(review): not every line of this body is visible in this listing (the MPLS handling mentioned
// below and the return statements are among the missing ones) - confirm against the full source.
39 static inline int find_ip(struct ether_hdr_arp *pkt, uint16_t len, uint32_t *ip_dst, uint16_t *vlan)
41 prox_rte_vlan_hdr *vlan_hdr;
42 prox_rte_ether_hdr *eth_hdr = (prox_rte_ether_hdr*)pkt;
43 prox_rte_ipv4_hdr *ip;
44 uint16_t ether_type = eth_hdr->ether_type;
45 uint16_t l2_len = sizeof(prox_rte_ether_hdr);
// Walk VLAN / 802.1ad tags for as long as another complete VLAN header still fits inside len
49 while (((ether_type == ETYPE_VLAN) || (ether_type == ETYPE_8021ad)) && (l2_len + sizeof(prox_rte_vlan_hdr) < len)) {
50 vlan_hdr = (prox_rte_vlan_hdr *)((uint8_t *)pkt + l2_len);
52 ether_type = vlan_hdr->eth_proto;
// The 0xFF0F mask is applied to the raw big-endian TCI, i.e. before the swap to CPU order
53 *vlan = rte_be_to_cpu_16(vlan_hdr->vlan_tci & 0xFF0F); // Store VLAN, or CVLAN if QinQ
59 // In case of MPLS, next hop MAC is based on MPLS, not destination IP
71 plog_warn("Unsupported packet type %x - CRC might be wrong\n", ether_type);
// l2_len == 0 is treated as "no IPv4 header to read"; otherwise the whole IPv4 header must fit in len
75 if (l2_len && (l2_len + sizeof(prox_rte_ipv4_hdr) <= len)) {
76 prox_rte_ipv4_hdr *ip = (prox_rte_ipv4_hdr *)((uint8_t *)pkt + l2_len);
77 // TODO: implement LPM => replace ip_dst by next hop IP DST
78 *ip_dst = ip->dst_addr;
// Report the VLAN of a frame without looking at the IP header.
//   pkt  : start of the frame; it is re-read here as a plain Ethernet header
//   len  : number of frame bytes that may be inspected
//   vlan : out - receives the TCI of the last VLAN tag walked over (masked, then converted to CPU order)
// NOTE(review): some lines of this body are not visible in this listing (e.g. how l2_len advances
// and whether *vlan gets a default value) - confirm against the full source.
84 static inline void find_vlan(struct ether_hdr_arp *pkt, uint16_t len, uint16_t *vlan)
86 prox_rte_vlan_hdr *vlan_hdr;
87 prox_rte_ether_hdr *eth_hdr = (prox_rte_ether_hdr*)pkt;
88 uint16_t ether_type = eth_hdr->ether_type;
89 uint16_t l2_len = sizeof(prox_rte_ether_hdr);
// Walk 802.1ad / VLAN tags for as long as another complete VLAN header still fits inside len
93 while (((ether_type == ETYPE_8021ad) || (ether_type == ETYPE_VLAN)) && (l2_len + sizeof(prox_rte_vlan_hdr) < len)) {
94 vlan_hdr = (prox_rte_vlan_hdr *)((uint8_t *)pkt + l2_len);
96 ether_type = vlan_hdr->eth_proto;
// The 0xFF0F mask is applied to the raw big-endian TCI, i.e. before the swap to CPU order
97 *vlan = rte_be_to_cpu_16(vlan_hdr->vlan_tci & 0xFF0F); // Store VLAN, or CVLAN if QinQ
// Locate the IPv6 addresses of a frame, looking through at most two VLAN tags.
//   pkt    : Ethernet header at the start of the frame
//   len    : number of frame bytes that may be inspected
//   ip_dst : out - receives a copy of the packet's IPv6 destination address
//   vlan   : out - receives the TCI of the last VLAN tag examined (masked, then converted to CPU order)
// When the EtherType is ETYPE_IPv6 and a full IPv6 header fits in len, the returned pointer refers to
// the source address inside the packet itself, so the caller can overwrite it in place.
// The caller in this file treats a NULL return as "not an IP packet".
// NOTE(review): in the visible lines the VLAN headers are read without a check against len, and the
// lines that advance l2_len are not shown - confirm against the full source.
101 static inline struct ipv6_addr *find_ip6(prox_rte_ether_hdr *pkt, uint16_t len, struct ipv6_addr *ip_dst, uint16_t *vlan)
103 uint16_t ether_type = pkt->ether_type;
104 uint16_t l2_len = sizeof(prox_rte_ether_hdr);
// Outer tag (VLAN or 802.1ad)
107 if ((ether_type == ETYPE_VLAN) || (ether_type == ETYPE_8021ad)) {
108 prox_rte_vlan_hdr *vlan_hdr = (prox_rte_vlan_hdr *)((uint8_t *)pkt + l2_len);
109 ether_type = vlan_hdr->eth_proto;
111 *vlan = rte_be_to_cpu_16(vlan_hdr->vlan_tci & 0xFF0F);
// Optional inner tag: the second VLAN header directly follows the first one
112 if (ether_type == ETYPE_VLAN) {
113 vlan_hdr = (prox_rte_vlan_hdr *)(vlan_hdr + 1);
114 ether_type = vlan_hdr->eth_proto;
116 *vlan = rte_be_to_cpu_16(vlan_hdr->vlan_tci & 0xFF0F);
119 if ((ether_type == ETYPE_IPv6) && (l2_len + sizeof(prox_rte_ipv6_hdr) <= len)) {
120 prox_rte_ipv6_hdr *ip = (prox_rte_ipv6_hdr *)((uint8_t *)pkt + l2_len);
121 // TODO: implement LPM => replace ip_dst by next hop IP DST
122 memcpy(ip_dst, &ip->dst_addr, sizeof(struct ipv6_addr));
123 return (struct ipv6_addr *)&ip->src_addr;
// Send an unsolicited Neighbour Advertisement for each IPv6 address configured on the task.
//   tbase : task whose l3 state supplies the addresses, the mbuf pool and the reachable port
// For local_ipv6 and then global_ipv6, when the address is non-zero (tested as one 128-bit integer):
// take an mbuf from l3.arp_nd_pool, build the advertisement with the port MAC and vlan_tags[0],
// hand it to tx_ctrlplane_pkt and count one non-dataplane TX packet.
// An error is logged when the pool cannot supply an mbuf.
// NOTE(review): the declaration of ret and the condition guarding the final "No neighbor
// advertisement sent" message are not visible in this listing.
128 void send_unsollicited_neighbour_advertisement(struct task_base *tbase)
131 uint8_t out = 0, port_id = tbase->l3.reachable_port_id;
132 struct rte_mbuf *mbuf = NULL;
// Link-local address
134 if (*(__int128 *)(&tbase->l3.local_ipv6) != 0) {
135 ret = rte_mempool_get(tbase->l3.arp_nd_pool, (void **)&mbuf);
136 if (likely(ret == 0)) {
137 mbuf->port = port_id;
138 build_neighbour_advertisement(tbase->l3.tmaster, mbuf, &prox_port_cfg[port_id].eth_addr, &tbase->l3.local_ipv6, PROX_UNSOLLICITED, prox_port_cfg[port_id].vlan_tags[0]);
139 tbase->aux->tx_ctrlplane_pkt(tbase, &mbuf, 1, &out);
140 TASK_STATS_ADD_TX_NON_DP(&tbase->aux->stats, 1);
142 plog_err("Failed to get a mbuf from arp/ndp mempool\n");
// Global address: same sequence as above
146 if (*(__int128 *)(&tbase->l3.global_ipv6) != 0) {
147 ret = rte_mempool_get(tbase->l3.arp_nd_pool, (void **)&mbuf);
148 if (likely(ret == 0)) {
149 mbuf->port = port_id;
150 build_neighbour_advertisement(tbase->l3.tmaster, mbuf, &prox_port_cfg[port_id].eth_addr, &tbase->l3.global_ipv6, PROX_UNSOLLICITED, prox_port_cfg[port_id].vlan_tags[0]);
151 tbase->aux->tx_ctrlplane_pkt(tbase, &mbuf, 1, &out);
152 TASK_STATS_ADD_TX_NON_DP(&tbase->aux->stats, 1);
154 plog_err("Failed to get a mbuf from arp/ndp mempool\n");
159 plog_err("No neighbor advertisement sent as no local or global ipv6\n");
// Send one Router Solicitation on the task's reachable port.
//   tbase : task whose l3 state supplies the mbuf pool and the reachable port
//   targ  : task arguments; targ->local_ipv6 is passed to build_router_sollicitation
// The packet is built in an mbuf taken from l3.arp_nd_pool, using the port MAC and vlan_tags[0],
// then handed to tx_ctrlplane_pkt and counted as one non-dataplane TX packet.
// An error is logged when the pool cannot supply an mbuf.
163 static void send_router_sollicitation(struct task_base *tbase, struct task_args *targ)
166 uint8_t out = 0, port_id = tbase->l3.reachable_port_id;
167 struct rte_mbuf *mbuf;
169 ret = rte_mempool_get(tbase->l3.arp_nd_pool, (void **)&mbuf);
170 if (likely(ret == 0)) {
171 mbuf->port = port_id;
172 build_router_sollicitation(mbuf, &prox_port_cfg[port_id].eth_addr, &targ->local_ipv6, prox_port_cfg[port_id].vlan_tags[0]);
173 tbase->aux->tx_ctrlplane_pkt(tbase, &mbuf, 1, &out);
174 TASK_STATS_ADD_TX_NON_DP(&tbase->aux->stats, 1);
176 plog_err("Failed to get a mbuf from arp/ndp mempool\n");
180 /* This implementation could be improved: instead of checking each time we send a packet whether we need also
181 to send an ARP, we should only check whether the MAC is valid.
182 We should check arp_ndp_retransmit_timeout in the master process. This would also require the generating task to clear its arp ring
183 to avoid sending many ARP while starting after a long stop.
184 We could also check for reachable_timeout in the master so that dataplane has only to check whether MAC is available
185 but this would require either thread safety, or the exchange of information between master and generating core.
// Create a table entry for an IPv4 address that is not yet known, so that an ARP request can follow.
//   ip_hash : hash table keyed by IPv4 address
//   ip_dst  : address to insert
//   entries : base of the table indexed by the position rte_hash_add_key() returns
//   nh      : next-hop index stored in the new entry
//   time    : out - set to point at the new entry's arp_ndp_retransmit_timeout
// When the key cannot be added (negative return from rte_hash_add_key) an error is logged and,
// per the comment below, no ARP is meant to be sent.
// NOTE(review): tsc, hz and arp_ndp_retransmit_timeout are not used in the visible lines, and the
// return statements are not shown in this listing.
188 static inline int add_key_and_send_arp(struct rte_hash *ip_hash, uint32_t *ip_dst, struct arp_table *entries, uint64_t tsc, uint64_t hz, uint32_t arp_ndp_retransmit_timeout, prox_next_hop_index_type nh, uint64_t **time)
190 int ret = rte_hash_add_key(ip_hash, (const void *)ip_dst);
191 if (unlikely(ret < 0)) {
192 // No reason to send ARP, as reply would be anyhow ignored
193 plogx_err("Unable to add ip "IPv4_BYTES_FMT" in mac_hash\n", IP4(*ip_dst));
// From here on ret is the slot assigned to the key; it is used directly as the index into entries
196 entries[ret].ip = *ip_dst;
197 entries[ret].nh = nh;
198 *time = &entries[ret].arp_ndp_retransmit_timeout;
// Decide, from the timers of an existing table entry, whether the packet and/or an ARP must be sent.
//   entry : table entry for the destination (or next hop)
//   mac   : out - destination MAC field of the packet, filled from the entry while its MAC is valid
//   tsc   : current timestamp
//   time  : out - set to the entry's arp_ndp_retransmit_timeout when an ARP should be sent
// Cases, in order:
//   - both timers still in the future: copy the MAC
//   - retransmit timer expired: request an ARP; the MAC is also copied (SEND_MBUF_AND_ARP_ND)
//     when reachable_timeout has not expired yet
//   - otherwise: MAC unknown and an ARP was sent recently
// NOTE(review): hz and arp_ndp_retransmit_timeout are not used in the visible lines; only one of
// the return statements is shown in this listing.
203 static inline int update_mac_and_send_mbuf(struct arp_table *entry, prox_rte_ether_addr *mac, uint64_t tsc, uint64_t hz, uint32_t arp_ndp_retransmit_timeout, uint64_t **time)
205 if (likely((tsc < entry->arp_ndp_retransmit_timeout) && (tsc < entry->reachable_timeout))) {
206 memcpy(mac, &entry->mac, sizeof(prox_rte_ether_addr));
208 } else if (tsc > entry->arp_ndp_retransmit_timeout) {
209 // long time since we have sent an arp, send arp
210 *time = &entry->arp_ndp_retransmit_timeout;
211 if (tsc < entry->reachable_timeout){
212 // MAC is valid in the table => send also the mbuf
213 memcpy(mac, &entry->mac, sizeof(prox_rte_ether_addr));
214 return SEND_MBUF_AND_ARP_ND;
216 // MAC still unknown, or timed out => only send ARP
220 // MAC is unknown and we already sent an ARP recently, drop mbuf and wait for ARP reply
// Fill in the destination MAC of an IPv4 packet and tell the caller what has to be transmitted.
//   tbase  : task whose l3 state (LPM, hash table, ARP tables, gateway entry) is consulted
//   mbuf   : packet whose Ethernet destination address is written in place
//   ip_dst : out - destination IPv4 address found in the packet
//   vlan   : out - VLAN found in the packet
//   time   : out - set to the retransmit timer of the entry for which an ARP should be sent
//   tsc    : current timestamp, compared against the per-entry timers
// Three lookup strategies are visible below: LPM next hops backed by the hash table (routing table
// configured), the single gateway entry, and a small linear table whose entries are copied to the
// hash table once n_pkts is no longer below 4.
// NOTE(review): not every line of this body is visible in this listing (several conditions and
// return statements are among the missing ones); the visible lines are kept verbatim apart from the
// fix described at the last add_key_and_send_arp() call.
224 int write_dst_mac(struct task_base *tbase, struct rte_mbuf *mbuf, uint32_t *ip_dst, uint16_t *vlan, uint64_t **time, uint64_t tsc)
226 const uint64_t hz = rte_get_tsc_hz();
227 struct ether_hdr_arp *packet = rte_pktmbuf_mtod(mbuf, struct ether_hdr_arp *);
228 prox_rte_ether_addr *mac = &packet->ether_hdr.d_addr;
229 prox_next_hop_index_type next_hop_index;
230 static uint64_t last_tsc = 0, n_no_route = 0;
232 struct l3_base *l3 = &(tbase->l3);
234 // First find the next hop
236 // A routing table was configured
237 // If a gw (gateway_ipv4) is also specified, it is used as default gw only i.e. lowest priority (shortest prefix)
238 // This is implemented automatically through lpm
239 uint16_t len = rte_pktmbuf_pkt_len(mbuf);
240 if (find_ip(packet, len, ip_dst, vlan) != 0) {
241 // Unable to find IP address => non IP packet => send it as is
// The LPM is queried with the byte-swapped destination address
244 if (unlikely(rte_lpm_lookup(l3->ipv4_lpm, rte_bswap32(*ip_dst), &next_hop_index) != 0)) {
245 // Prevent printing too many messages
247 if (tsc > last_tsc + rte_get_tsc_hz()) {
248 plogx_err("No route to IP "IPv4_BYTES_FMT" (%ld times)\n", IP4(*ip_dst), n_no_route);
254 struct arp_table *entry = &l3->next_hops[next_hop_index];
258 return update_mac_and_send_mbuf(entry, mac, tsc, hz, l3->arp_ndp_retransmit_timeout, time);
261 // no next ip: this is a local route
262 // Find IP in lookup table. Send ARP if not found
263 int ret = rte_hash_lookup(l3->ip_hash, (const void *)ip_dst);
264 if (unlikely(ret < 0)) {
265 // IP not found, try to send an ARP
266 return add_key_and_send_arp(l3->ip_hash, ip_dst, l3->arp_table, tsc, hz, l3->arp_ndp_retransmit_timeout, MAX_HOP_INDEX, time);
268 return update_mac_and_send_mbuf(&l3->arp_table[ret], mac, tsc, hz, l3->arp_ndp_retransmit_timeout, time);
272 // No Routing table specified: only a local ip and maybe a gateway
273 // Old default behavior: if a gw is specified, ALL packets go to this gateway (even those we could send w/o the gw
275 uint16_t len = rte_pktmbuf_pkt_len(mbuf);
277 find_vlan(packet, len, vlan);
278 if (likely((l3->flags & FLAG_DST_MAC_KNOWN) && (tsc < l3->gw.arp_ndp_retransmit_timeout) && (tsc < l3->gw.reachable_timeout))) {
279 memcpy(mac, &l3->gw.mac, sizeof(prox_rte_ether_addr));
281 } else if (tsc > l3->gw.arp_ndp_retransmit_timeout) {
282 // long time since we have successfully sent an arp, send arp
283 // If sending ARP failed (ring full) then arp_ndp_retransmit_timeout is not updated to avoid having to wait 1 sec to send ARP REQ again
284 *time = &l3->gw.arp_ndp_retransmit_timeout;
285 l3->gw.arp_ndp_retransmit_timeout = tsc + l3->arp_ndp_retransmit_timeout * hz / 1000;
288 if ((l3->flags & FLAG_DST_MAC_KNOWN) && (tsc < l3->gw.reachable_timeout)){
289 // MAC is valid in the table => send also the mbuf
290 memcpy(mac, &l3->gw.mac, sizeof(prox_rte_ether_addr));
291 return SEND_MBUF_AND_ARP_ND;
293 // MAC still unknown, or timed out => only send ARP
297 // MAC is unknown and we already sent an ARP recently, drop mbuf and wait for ARP reply
302 if (find_ip(packet, len, ip_dst, vlan) != 0) {
303 // Unable to find IP address => non IP packet => send it as is
// Few destinations so far: search the small table linearly instead of hashing
306 if (likely(l3->n_pkts < 4)) {
307 for (unsigned int idx = 0; idx < l3->n_pkts; idx++) {
308 if (*ip_dst == l3->optimized_arp_table[idx].ip) {
309 return update_mac_and_send_mbuf(&l3->optimized_arp_table[idx], mac, tsc, hz, l3->arp_ndp_retransmit_timeout, time);
312 // IP address not found in table
313 l3->optimized_arp_table[l3->n_pkts].ip = *ip_dst;
314 *time = &l3->optimized_arp_table[l3->n_pkts].arp_ndp_retransmit_timeout;
317 if (l3->n_pkts < 4) {
321 // We have too many IP addresses to search linearly; lets use hash table instead => copy all entries in hash table
322 for (uint32_t idx = 0; idx < l3->n_pkts; idx++) {
323 uint32_t ip = l3->optimized_arp_table[idx].ip;
324 int ret = rte_hash_add_key(l3->ip_hash, (const void *)&ip);
326 // This should not happen as few entries so far.
327 // If it happens, we still send the ARP as easier:
328 // If the ARP corresponds to this error, the ARP reply will be ignored
329 // If ARP does not correspond to this error/ip, then ARP reply will be handled.
330 plogx_err("Unable add ip "IPv4_BYTES_FMT" in mac_hash (already %d entries)\n", IP4(ip), idx);
332 memcpy(&l3->arp_table[ret], &l3->optimized_arp_table[idx], sizeof(struct arp_table));
337 // Find IP in lookup table. Send ARP if not found
338 int ret = rte_hash_lookup(l3->ip_hash, (const void *)ip_dst);
339 if (unlikely(ret < 0)) {
340 // IP not found, try to send an ARP
// Pass the base of the table: ret is negative on this path, and add_key_and_send_arp() indexes
// the table with the position returned by rte_hash_add_key(). Passing &l3->arp_table[ret] would
// shift every access by that negative value and write outside the intended entry.
341 return add_key_and_send_arp(l3->ip_hash, ip_dst, l3->arp_table, tsc, hz, l3->arp_ndp_retransmit_timeout, MAX_HOP_INDEX, time);
344 return update_mac_and_send_mbuf(&l3->arp_table[ret], mac, tsc, hz, l3->arp_ndp_retransmit_timeout, time);
// Fill in the destination MAC of an IPv6 packet, rewrite its source address, and tell the caller
// what has to be transmitted.
//   tbase  : task whose l3 state (addresses, gateway, NDP tables, ip6_hash) is consulted
//   mbuf   : packet modified in place (destination MAC and IPv6 source address)
//   ip_dst : out - IPv6 destination from the packet; replaced by the gateway address when l3->gw.ip6
//            is set and the destination shares its first 8 bytes with neither local nor global address
//   vlan   : out - VLAN found in the packet
//   tsc    : current timestamp, compared against the per-entry timers
// MAC resolution uses a small linear table while n_pkts is below 4 and ip6_hash afterwards.
// NOTE(review): not every line of this body is visible in this listing (most return statements and
// several else branches are among the missing ones) - confirm against the full source.
351 int write_ip6_dst_mac(struct task_base *tbase, struct rte_mbuf *mbuf, struct ipv6_addr *ip_dst, uint16_t *vlan, uint64_t tsc)
353 const uint64_t hz = rte_get_tsc_hz();
354 prox_rte_ether_hdr *packet = rte_pktmbuf_mtod(mbuf, prox_rte_ether_hdr *);
355 prox_rte_ether_addr *mac = &packet->d_addr;
356 struct ipv6_addr *used_ip_src;
358 uint16_t len = rte_pktmbuf_pkt_len(mbuf);
360 struct ipv6_addr *pkt_src_ip6;
361 if ((pkt_src_ip6 = find_ip6(packet, len, ip_dst, vlan)) == NULL) {
362 // Unable to find IP address => non IP packet => send it as is
365 struct l3_base *l3 = &(tbase->l3);
367 // Configure source IP
// The "same prefix" tests below compare only the first 8 bytes of the addresses
368 if (*(uint64_t *)(&l3->local_ipv6) == *(uint64_t *)ip_dst) {
369 // Same prefix as local -> use local
370 used_ip_src = &l3->local_ipv6;
371 } else if (*(uint64_t *)(&l3->global_ipv6) == *(uint64_t *)ip_dst) {
372 // Same prefix as global -> use global
373 used_ip_src = &l3->global_ipv6;
374 } else if (*(__int128 *)(&l3->gw.ip6) != 0) {
// A gateway is configured: source is the global address, and the address to resolve becomes the gateway
375 used_ip_src = &l3->global_ipv6;
376 memcpy(ip_dst, &l3->gw.ip6, sizeof(struct ipv6_addr));
377 } else if (*(__int128 *)(&l3->global_ipv6) != 0) {
378 // Global IP is defined -> use it
379 used_ip_src = &l3->global_ipv6;
381 plog_info("Error as trying to send a packet to "IPv6_BYTES_FMT" using "IPv6_BYTES_FMT" (local)\n", IPv6_BYTES(ip_dst->bytes), IPv6_BYTES(l3->local_ipv6.bytes));
// Overwrite the source address inside the packet with the one selected above
384 rte_memcpy(pkt_src_ip6, used_ip_src, sizeof(struct ipv6_addr));
387 if (likely(l3->n_pkts < 4)) {
388 for (unsigned int idx = 0; idx < l3->n_pkts; idx++) {
389 if (*(__int128 *)ip_dst == *(__int128 *)(&l3->optimized_arp_table[idx].ip6)) {
390 // IP address already in table
391 if ((tsc < l3->optimized_arp_table[idx].arp_ndp_retransmit_timeout) && (tsc < l3->optimized_arp_table[idx].reachable_timeout)) {
392 // MAC address was recently updated in table, use it
393 // plog_dbg("Valid MAC address found => send packet\n");
394 rte_memcpy(mac, &l3->optimized_arp_table[idx].mac, sizeof(prox_rte_ether_addr));
396 } else if (tsc > l3->optimized_arp_table[idx].arp_ndp_retransmit_timeout) {
397 // NDP not sent since a long time, send NDP
398 l3->optimized_arp_table[idx].arp_ndp_retransmit_timeout = tsc + l3->arp_ndp_retransmit_timeout * hz / 1000;
399 if (tsc < l3->optimized_arp_table[idx].reachable_timeout) {
400 // MAC still valid => also send mbuf
401 plog_dbg("Valid MAC found but NDP retransmit timeout => send packet and NDP\n");
402 memcpy(mac, &l3->optimized_arp_table[idx].mac, sizeof(prox_rte_ether_addr));
403 return SEND_MBUF_AND_ARP_ND;
405 plog_dbg("Unknown MAC => send NDP but cannot send packet\n");
406 // MAC invalid => only send NDP
410 // NDP timeout elapsed, MAC not valid anymore but waiting for NDP reply
411 // plog_dbg("NDP reachable timeout elapsed - waiting for NDP reply\n");
416 // IP address not found in table
417 memcpy(&l3->optimized_arp_table[l3->n_pkts].ip6, ip_dst, sizeof(struct ipv6_addr));
418 l3->optimized_arp_table[l3->n_pkts].arp_ndp_retransmit_timeout = tsc + l3->arp_ndp_retransmit_timeout * hz / 1000;
421 if (l3->n_pkts < 4) {
425 // We have too many IP addresses to search linearly; lets use hash table instead => copy all entries in hash table
426 for (uint32_t idx = 0; idx < l3->n_pkts; idx++) {
427 struct ipv6_addr *ip6 = &l3->optimized_arp_table[idx].ip6;
428 int ret = rte_hash_add_key(l3->ip6_hash, (const void *)ip6);
430 // This should not happen as few entries so far.
431 // If it happens, we still send the NDP as easier:
432 // If the NDP corresponds to this error, the NDP reply will be ignored
433 // If NDP does not correspond to this error/ip, then NDP reply will be handled.
434 plogx_err("Unable add ip "IPv6_BYTES_FMT" in mac_hash (already %d entries)\n", IPv6_BYTES(ip6->bytes), idx);
436 memcpy(&l3->arp_table[ret], &l3->optimized_arp_table[idx], sizeof(struct arp_table));
441 // Find IP in lookup table. Send ND if not found
442 int ret = rte_hash_lookup(l3->ip6_hash, (const void *)ip_dst);
443 if (unlikely(ret < 0)) {
444 // IP not found, try to send an ND
// This inner ret shadows the lookup result and holds the slot assigned to the new key
445 int ret = rte_hash_add_key(l3->ip6_hash, (const void *)ip_dst);
447 // No reason to send NDP, as reply would be anyhow ignored
448 plogx_err("Unable to add ip "IPv6_BYTES_FMT" in mac_hash\n", IPv6_BYTES(ip_dst->bytes));
451 memcpy(&l3->arp_table[ret].ip6, ip_dst, sizeof(struct ipv6_addr));
452 l3->arp_table[ret].arp_ndp_retransmit_timeout = tsc + l3->arp_ndp_retransmit_timeout * hz / 1000;
457 if (likely((tsc < l3->arp_table[ret].arp_ndp_retransmit_timeout) && (tsc < l3->arp_table[ret].reachable_timeout))) {
458 // MAC still valid and NDP sent recently
459 memcpy(mac, &l3->arp_table[ret].mac, sizeof(prox_rte_ether_addr));
461 } else if (tsc > l3->arp_table[ret].arp_ndp_retransmit_timeout) {
462 // NDP not sent since a long time, send NDP
463 l3->arp_table[ret].arp_ndp_retransmit_timeout = tsc + l3->arp_ndp_retransmit_timeout * hz / 1000;
464 if (tsc < l3->arp_table[ret].reachable_timeout) {
465 // MAC still valid => send also MBUF
466 memcpy(mac, &l3->arp_table[ret].mac, sizeof(prox_rte_ether_addr));
467 return SEND_MBUF_AND_ARP_ND;
// Set up the L3 (ARP/NDP) state of a task at init time.
//   tbase : task being initialised; its l3 sub-structure and flags are filled in
//   targ  : task configuration (flags, core/task ids, gateway addresses, timeouts)
// Creates the IPv4 hash (TASK_ARG_L3) and/or the IPv6 hash (TASK_ARG_NDP), allocates the shared
// arp/ndp table with MAX_ARP_ENTRIES * 4 entries, installs handle_ctrl_plane_pkts as control
// function for the task, and applies DEFAULT_ARP_TIMEOUT / DEFAULT_ARP_UPDATE_TIME when the
// corresponding timeout is left at 0 in targ.
// Panics when a hash table or the arp/ndp table cannot be created.
// NOTE(review): not every line of this body is visible in this listing (e.g. the remaining
// hash_params fields and the else keywords); the visible lines are kept verbatim apart from the
// bounded snprintf() and the corrected size reported in the log line.
480 void task_init_l3(struct task_base *tbase, struct task_args *targ)
482 static char hash_name[30];
483 uint32_t n_entries = MAX_ARP_ENTRIES * 4;
484 const int socket_id = rte_lcore_to_socket_id(targ->lconf->id);
// Bounded formatting: the name can never run past the 30-byte buffer, whatever the ids are
485 snprintf(hash_name, sizeof(hash_name), "A%03d_%03d_mac_table", targ->lconf->id, targ->id);
489 struct rte_hash_parameters hash_params = {
491 .entries = n_entries,
492 .key_len = sizeof(uint32_t),
493 .hash_func = rte_hash_crc,
494 .hash_func_init_val = 0,
496 if (targ->flags & TASK_ARG_L3) {
497 plog_info("\t\tInitializing L3 (IPv4)\n");
498 tbase->l3.ip_hash = rte_hash_create(&hash_params);
499 PROX_PANIC(tbase->l3.ip_hash == NULL, "Failed to set up ip hash table\n");
503 if (targ->flags & TASK_ARG_NDP) {
504 plog_info("\t\tInitializing NDP (IPv6)\n");
// Same parameters as for IPv4, only the key becomes a full IPv6 address
505 hash_params.key_len = sizeof(struct ipv6_addr);
506 tbase->l3.ip6_hash = rte_hash_create(&hash_params);
507 PROX_PANIC(tbase->l3.ip6_hash == NULL, "Failed to set up ip hash table\n");
509 tbase->l3.arp_table = (struct arp_table *)prox_zmalloc(n_entries * sizeof(struct arp_table), socket_id);
510 PROX_PANIC(tbase->l3.arp_table == NULL, "Failed to allocate memory for %u entries in arp/ndp table\n", n_entries);
// Report the size of one table entry (struct arp_table), matching the allocation above
511 plog_info("\t\tarp/ndp table, with %d entries of size %ld\n", n_entries, sizeof(struct arp_table));
513 targ->lconf->ctrl_func_p[targ->task] = handle_ctrl_plane_pkts;
514 targ->lconf->ctrl_timeout = freq_to_tsc(targ->ctrl_freq);
515 tbase->l3.gw.ip = rte_cpu_to_be_32(targ->gateway_ipv4);
516 memcpy(&tbase->l3.gw.ip6, &targ->gateway_ipv6, sizeof(struct ipv6_addr));
517 tbase->flags |= TASK_L3;
518 tbase->l3.core_id = targ->lconf->id;
519 tbase->l3.task_id = targ->id;
520 tbase->l3.tmaster = targ->tmaster;
521 tbase->l3.seed = (uint)rte_rdtsc();
522 if (targ->reachable_timeout != 0)
523 tbase->l3.reachable_timeout = targ->reachable_timeout;
525 tbase->l3.reachable_timeout = DEFAULT_ARP_TIMEOUT;
526 if (targ->arp_ndp_retransmit_timeout != 0)
527 tbase->l3.arp_ndp_retransmit_timeout = targ->arp_ndp_retransmit_timeout;
529 tbase->l3.arp_ndp_retransmit_timeout = DEFAULT_ARP_UPDATE_TIME;
// Finish the L3 set-up of a task when it starts; the body only runs when a reachable port exists
// and the task has no ARP/ND mbuf pool yet.
//   tbase : task whose l3 state is completed (reachable port, LPM, next hops, IPv6 addresses, pool)
//   targ  : task configuration (local/gateway addresses, route table name, flags)
// Visible steps, in order:
//   - cross-check the task's local_ipv4/prefix against the port section (panic on mismatch) and
//     copy them to the port when the port has none
//   - register every configured port IP with the control plane
//   - when a route table is named: load it into an LPM, build the next-hop array, then add the
//     local addresses and the default gateway as extra next hops
//   - for NDP tasks: derive a link-local address when none is configured, and register the task
//     as router or node
//   - create the ARP/ND mbuf pool, then optionally send a Router Solicitation and an unsolicited
//     Neighbour Advertisement
// NOTE(review): not every line of this body is visible in this listing (declarations of ret/lpm,
// several closing braces and else branches are among the missing ones).
532 void task_start_l3(struct task_base *tbase, struct task_args *targ)
534 const int socket_id = rte_lcore_to_socket_id(targ->lconf->id);
535 const int NB_ARP_ND_MBUF = 1024;
536 const int ARP_ND_MBUF_SIZE = 2048;
537 const int NB_CACHE_ARP_ND_MBUF = 256;
539 struct prox_port_cfg *port = find_reachable_port(targ);
540 if (port && (tbase->l3.arp_nd_pool == NULL)) {
541 static char name[] = "arp0_pool";
// The port id is the index of the port inside the global prox_port_cfg array
542 tbase->l3.reachable_port_id = port - prox_port_cfg;
543 if ((targ->local_ipv4 && port->ip_addr[0].ip) && (targ->local_ipv4 != port->ip_addr[0].ip)) {
544 PROX_PANIC(1, "local_ipv4 in core section ("IPv4_BYTES_FMT") differs from port section ("IPv4_BYTES_FMT")\n", IP4(rte_be_to_cpu_32(targ->local_ipv4)), IP4(rte_be_to_cpu_32(port->ip_addr[0].ip)));
546 if ((targ->local_ipv4 && port->ip_addr[0].ip) && (targ->local_prefix != port->ip_addr[0].prefix)) {
547 PROX_PANIC(1, "local_ipv4 prefix in core section (%d) differs from port section (%d)\n", targ->local_prefix, port->ip_addr[0].prefix);
// Only the core section carries an address: propagate it to the port
549 if (!port->ip_addr[0].ip && targ->local_ipv4) {
550 port->ip_addr[0].ip = targ->local_ipv4;
551 port->ip_addr[0].prefix = targ->local_prefix;
553 port->vlan_tags[0] = 0;
554 plog_info("Setting port local_ipv4 from core %d local_ipv4 to "IPv4_BYTES_FMT"\n", tbase->l3.reachable_port_id, IP4(rte_be_to_cpu_32(port->ip_addr[0].ip)));
556 for (int vlan_id = 0; vlan_id < port->n_vlans; vlan_id++) {
557 if (port->ip_addr[vlan_id].ip)
558 register_ip_to_ctrl_plane(tbase->l3.tmaster, rte_be_to_cpu_32(port->ip_addr[vlan_id].ip), tbase->l3.reachable_port_id, targ->lconf->id, targ->id);
560 if (strcmp(targ->route_table, "") != 0) {
564 PROX_PANIC(port->n_vlans == 0, "missing local_ipv4 while route table is specified in L3 mode\n");
566 // LPM might be modified runtime => do not share with other cores
567 ret = lua_to_lpm4(prox_lua(), GLOBAL, targ->route_table, socket_id, &lpm);
568 PROX_PANIC(ret, "Failed to load IPv4 LPM:\n%s\n", get_lua_to_errors());
570 tbase->l3.ipv4_lpm = lpm->rte_lpm;
571 tbase->l3.next_hops = prox_zmalloc(sizeof(*tbase->l3.next_hops) * MAX_HOP_INDEX, socket_id);
572 PROX_PANIC(tbase->l3.next_hops == NULL, "Could not allocate memory for next hop\n");
574 for (uint32_t i = 0; i < MAX_HOP_INDEX; i++) {
575 if (!lpm->next_hops[i].ip_dst)
578 tbase->l3.next_hops[i].ip = rte_bswap32(lpm->next_hops[i].ip_dst);
579 int tx_port = lpm->next_hops[i].mac_port.out_idx;
580 // gen only supports one port right now .... hence port = 0
// The panic only triggers when tx_port exceeds both the tx port count and the tx ring count
581 if ((tx_port > targ->nb_txports - 1) && (tx_port > targ->nb_txrings - 1)) {
582 PROX_PANIC(1, "Routing Table contains port %d but only %d tx port/ %d ring:\n", tx_port, targ->nb_txports, targ->nb_txrings);
585 plog_info("Using routing table %s in l3 mode, with %d gateways\n", targ->route_table, tbase->l3.nb_gws);
587 // Last but one (x n_vlans) "next_hop_index" is not a gateway but direct routes
588 for (int vlan_id = 0; vlan_id < port->n_vlans; vlan_id++) {
// A next hop with ip == 0 marks a directly connected (local) route
589 tbase->l3.next_hops[tbase->l3.nb_gws].ip = 0;
590 ret = rte_lpm_add(tbase->l3.ipv4_lpm, port->ip_addr[vlan_id].ip, port->ip_addr[vlan_id].prefix, tbase->l3.nb_gws++);
591 PROX_PANIC(ret, "Failed to add local_ipv4 "IPv4_BYTES_FMT"/%d to lpm\n", IP4(port->ip_addr[vlan_id].ip), port->ip_addr[vlan_id].prefix);
594 // Last "next_hop_index" is default gw
595 tbase->l3.next_hops[tbase->l3.nb_gws].ip = rte_bswap32(targ->gateway_ipv4);
596 if (targ->gateway_ipv4) {
// Prefix length 0: the gateway becomes the default route
597 ret = rte_lpm_add(tbase->l3.ipv4_lpm, targ->gateway_ipv4, 0, tbase->l3.nb_gws++);
598 PROX_PANIC(ret, "Failed to add gateway_ipv4 "IPv4_BYTES_FMT"/%d to lpm\n", IP4(tbase->l3.gw.ip), 0);
602 master_init_vdev(tbase->l3.tmaster, tbase->l3.reachable_port_id, targ->lconf->id, targ->id);
604 // Create IPv6 addr if none were configured
605 if (targ->flags & TASK_ARG_NDP) {
606 if (!memcmp(&targ->local_ipv6, &null_addr, sizeof(struct ipv6_addr))) {
607 set_link_local(&targ->local_ipv6);
608 set_EUI(&targ->local_ipv6, &port->eth_addr);
610 plog_info("\tCore %d, task %d, local IPv6 addr is "IPv6_BYTES_FMT" (%s)\n",
611 targ->lconf->id, targ->id,
612 IPv6_BYTES(targ->local_ipv6.bytes),
613 IP6_Canonical(&targ->local_ipv6));
614 memcpy(&tbase->l3.local_ipv6, &targ->local_ipv6, sizeof(struct ipv6_addr));
// The global address is only copied when one was configured (i.e. it differs from null_addr)
616 if (memcmp(&targ->global_ipv6, &null_addr, sizeof(struct ipv6_addr))) {
617 memcpy(&tbase->l3.global_ipv6, &targ->global_ipv6, sizeof(struct ipv6_addr));
618 plog_info("\tCore %d, task %d, global IPv6 addr is "IPv6_BYTES_FMT" (%s)\n",
619 targ->lconf->id, targ->id,
620 IPv6_BYTES(targ->global_ipv6.bytes),
621 IP6_Canonical(&targ->global_ipv6));
623 if (targ->ipv6_router)
624 register_router_to_ctrl_plane(tbase->l3.tmaster, tbase->l3.reachable_port_id, targ->lconf->id, targ->id, &targ->local_ipv6, &targ->global_ipv6, &targ->router_prefix);
626 register_node_to_ctrl_plane(tbase->l3.tmaster, &targ->local_ipv6, &targ->global_ipv6, tbase->l3.reachable_port_id, targ->lconf->id, targ->id);
// Dedicated pool for the ARP/ND packets generated by this task
630 struct rte_mempool *ret = rte_mempool_create(name, NB_ARP_ND_MBUF, ARP_ND_MBUF_SIZE, NB_CACHE_ARP_ND_MBUF,
631 sizeof(struct rte_pktmbuf_pool_private), rte_pktmbuf_pool_init, NULL, rte_pktmbuf_init, 0,
633 PROX_PANIC(ret == NULL, "Failed to allocate ARP/ND memory pool on socket %u with %u elements\n",
634 rte_socket_id(), NB_ARP_ND_MBUF);
635 plog_info("\tMempool %p (%s) size = %u * %u cache %u, socket %d (for ARP/ND)\n", ret, name, NB_ARP_ND_MBUF,
636 ARP_ND_MBUF_SIZE, NB_CACHE_ARP_ND_MBUF, rte_socket_id());
637 tbase->l3.arp_nd_pool = ret;
// Only non-router NDP tasks solicit a router
638 if ((targ->flags & TASK_ARG_NDP) && (!targ->ipv6_router)) {
639 plog_info("Sending Router Sollicitation\n");
640 send_router_sollicitation(tbase, targ);
642 if ((targ->flags & TASK_ARG_NDP) && (targ->flags & TASK_ARG_SEND_NA_AT_STARTUP)) {
643 plog_info("Sending unsollicited Neighbour Advertisement\n");
644 send_unsollicited_neighbour_advertisement(tbase);
650 void task_set_gateway_ip(struct task_base *tbase, uint32_t ip)
652 tbase->l3.gw.ip = ip;
653 tbase->flags &= ~FLAG_DST_MAC_KNOWN;
// Zero the ARP retransmit timer of the entry matching ip.
//   l3 : L3 state holding the hash table, the gateway entry and the small linear table
//   ip : IPv4 address whose entry must be refreshed
// It is called from the MAC_INFO_FROM_MASTER handling when the reported MAC is all zeroes.
// With the timer at 0, the "tsc > arp_ndp_retransmit_timeout" checks used on the transmit path
// select the branch that sends a new ARP request; reachable_timeout is left untouched.
// The entry is searched in the hash table, the gateway entry, or the small linear table.
// NOTE(review): the conditions selecting between these lookups (and the comparison inside the
// loop) are not visible in this listing - confirm against the full source.
656 static void reset_arp_ndp_retransmit_timeout(struct l3_base *l3, uint32_t ip)
659 plogx_dbg("MAC entry for IP "IPv4_BYTES_FMT" timeout in kernel\n", IP4(ip));
662 int ret = rte_hash_lookup(l3->ip_hash, (const void *)&ip);
664 l3->arp_table[ret].arp_ndp_retransmit_timeout = 0;
665 } else if (ip == l3->gw.ip) {
666 l3->gw.arp_ndp_retransmit_timeout = 0;
667 } else if (l3->n_pkts < 4) {
// Few destinations tracked: linear search in the small table
668 for (idx = 0; idx < l3->n_pkts; idx++) {
669 uint32_t ip_dst = l3->optimized_arp_table[idx].ip;
// idx < n_pkts here means the loop stopped early, i.e. a matching entry was found
673 if (idx < l3->n_pkts) {
674 l3->optimized_arp_table[idx].arp_ndp_retransmit_timeout = 0;
677 int ret = rte_hash_lookup(l3->ip_hash, (const void *)&ip);
679 l3->arp_table[ret].arp_ndp_retransmit_timeout = 0;
// Map a gateway IP to its index in tbase->l3.next_hops, registering it when it is new.
//   tbase : task whose next-hop array (l3.next_hops, l3.nb_gws entries in use) is searched
//   gw_ip : gateway address, compared as-is with next_hops[i].ip
// Returns MAX_HOP_INDEX when the array has no room for another gateway; the caller in this file
// treats any value >= MAX_HOP_INDEX as failure.
// NOTE(review): the statements between the visible lines (the return inside the loop and the
// update of nb_gws) are not shown in this listing.
684 static prox_next_hop_index_type get_nh_index(struct task_base *tbase, uint32_t gw_ip)
686 // Check if gateway already exists
687 for (prox_next_hop_index_type i = 0; i < tbase->l3.nb_gws; i++) {
688 if (tbase->l3.next_hops[i].ip == gw_ip) {
// Not found: store the gateway in the first free slot, provided one is left
692 if (tbase->l3.nb_gws < MAX_HOP_INDEX) {
693 tbase->l3.next_hops[tbase->l3.nb_gws].ip = gw_ip;
695 return tbase->l3.nb_gws - 1;
697 return MAX_HOP_INDEX;
// Process a burst of control-plane messages received from the master task.
//   tbase  : task whose l3 state (LPM, next hops, ARP/NDP tables, gateway, IPv6 addresses) is updated
//   mbufs  : messages; the command of each one is read with get_command()
//   n_pkts : number of messages in mbufs
// Commands visible below:
//   ROUTE_ADD_FROM_MASTER / ROUTE_DEL_FROM_MASTER : add or delete an LPM rule
//   MAC_INFO_FROM_MASTER                          : store (or, for an all-zero MAC, expire) an IPv4 MAC
//   MAC_INFO_FROM_MASTER_FOR_IPV6                 : store an IPv6 neighbour MAC
//   SEND_NDP / SEND_ARP_REQUEST / SEND_ARP_REPLY / SEND_ICMP _FROM_MASTER : forward the packet
//   IPV6_INFO_FROM_MASTER                         : learn or check the global IPv6 prefix
// One more visible section filters PROX pseudo packets (UDP, both ports PROX_PSEUDO_PKT_PORT,
// dgram_len 8) and forwards other TAP packets; its case label is not shown in this listing.
// NOTE(review): many lines of this body are not visible in this listing (switch/case framing,
// break statements, several conditions, and everything after the final plog_err) - confirm
// against the full source before changing any logic here.
699 void handle_ctrl_plane_pkts(struct task_base *tbase, struct rte_mbuf **mbufs, uint16_t n_pkts)
702 const uint64_t hz = rte_get_tsc_hz();
703 uint32_t ip, ip_dst, idx, gateway_ip, prefix;
704 prox_next_hop_index_type gateway_index;
705 int j, ret, modified_route;
707 struct ipv6_addr *ip6, *ip6_dst;
709 prox_rte_ether_hdr *hdr;
710 struct ether_hdr_arp *hdr_arp;
711 struct l3_base *l3 = &tbase->l3;
712 uint64_t tsc= rte_rdtsc();
// l3->reachable_timeout is scaled by hz / 1000 to obtain a duration in TSC ticks
713 uint64_t reachable_timeout = l3->reachable_timeout * hz / 1000;
715 prox_rte_ipv4_hdr *pip;
716 prox_rte_udp_hdr *udp_hdr;
717 uint8_t port = tbase->l3.reachable_port_id;
719 for (j = 0; j < n_pkts; ++j) {
722 for (j = 0; j < n_pkts; ++j) {
723 PREFETCH0(rte_pktmbuf_mtod(mbufs[j], void *));
726 for (j = 0; j < n_pkts; ++j) {
729 out[0] = OUT_HANDLED;
730 command = get_command(mbufs[j]);
731 plogx_dbg("\tReceived %s mbuf %p\n", actions_string[command], mbufs[j]);
733 case ROUTE_ADD_FROM_MASTER:
734 ip = ctrl_ring_get_ip(mbufs[j]);
735 gateway_ip = ctrl_ring_get_gateway_ip(mbufs[j]);
736 prefix = ctrl_ring_get_prefix(mbufs[j]);
737 gateway_index = get_nh_index(tbase, gateway_ip);
738 if (gateway_index >= MAX_HOP_INDEX) {
739 plog_err("Unable to find or define gateway index - too many\n");
// Remember whether the rule already existed, so the log can tell "modified" from "added"
742 modified_route = rte_lpm_is_rule_present(tbase->l3.ipv4_lpm, rte_bswap32(ip), prefix, &nh);
743 ret = rte_lpm_add(tbase->l3.ipv4_lpm, rte_bswap32(ip), prefix, gateway_index);
745 plog_err("Failed to add route to "IPv4_BYTES_FMT"/%d using "IPv4_BYTES_FMT"(index = %d)\n", IP4(ip), prefix, IP4(gateway_ip), gateway_index);
746 } else if (modified_route)
747 plogx_dbg("Modified route to "IPv4_BYTES_FMT"/%d using "IPv4_BYTES_FMT"(index = %d) (was using "IPv4_BYTES_FMT"(index = %d)\n", IP4(ip), prefix, IP4(gateway_ip), gateway_index, IP4(tbase->l3.next_hops[nh].ip), nh);
749 plogx_dbg("Added new route to "IPv4_BYTES_FMT"/%d using "IPv4_BYTES_FMT"(index = %d)\n", IP4(ip), prefix, IP4(gateway_ip), gateway_index);
753 case ROUTE_DEL_FROM_MASTER:
754 ip = ctrl_ring_get_ip(mbufs[j]);
755 prefix = ctrl_ring_get_prefix(mbufs[j]);
757 ret = rte_lpm_is_rule_present(tbase->l3.ipv4_lpm, rte_bswap32(ip), prefix, &nh);
759 ret = rte_lpm_delete(tbase->l3.ipv4_lpm, rte_bswap32(ip), prefix);
// NOTE(review): this message follows rte_lpm_delete(), so "add" looks like a copy/paste slip
761 plog_err("Failed to add rule\n");
763 plog_info("Deleting route to "IPv4_BYTES_FMT"/%d\n", IP4(ip), prefix);
767 case MAC_INFO_FROM_MASTER:
768 hdr_arp = rte_pktmbuf_mtod(mbufs[j], struct ether_hdr_arp *);
769 ip = get_ip(mbufs[j]);
771 if (prox_rte_is_zero_ether_addr(&hdr_arp->arp.data.sha)) {
772 // MAC timeout or deleted from kernel table => reset update_time
773 // This will cause us to send new ARP request
774 // However, as reachable_timeout not touched, we should continue sending our regular IP packets
775 reset_arp_ndp_retransmit_timeout(l3, ip);
778 plogx_dbg("\tUpdating MAC entry for IP "IPv4_BYTES_FMT" with MAC "MAC_BYTES_FMT"\n",
779 IP4(ip), MAC_BYTES(hdr_arp->arp.data.sha.addr_bytes));
783 struct arp_table *entry;
784 ret = rte_hash_add_key(l3->ip_hash, (const void *)&ip);
786 plogx_info("Unable add ip "IPv4_BYTES_FMT" in mac_hash\n", IP4(ip));
// An entry whose nh differs from MAX_HOP_INDEX refers to a next hop: the MAC is stored there
787 } else if ((nh = l3->arp_table[ret].nh) != MAX_HOP_INDEX) {
788 entry = &l3->next_hops[nh];
789 memcpy(&entry->mac, &(hdr_arp->arp.data.sha), sizeof(prox_rte_ether_addr));
790 entry->reachable_timeout = tsc + reachable_timeout;
791 update_arp_ndp_retransmit_timeout(l3, &entry->arp_ndp_retransmit_timeout, l3->arp_ndp_retransmit_timeout);
793 memcpy(&l3->arp_table[ret].mac, &(hdr_arp->arp.data.sha), sizeof(prox_rte_ether_addr));
794 l3->arp_table[ret].reachable_timeout = tsc + reachable_timeout;
795 update_arp_ndp_retransmit_timeout(l3, &l3->arp_table[ret].arp_ndp_retransmit_timeout, l3->arp_ndp_retransmit_timeout);
798 else if (ip == l3->gw.ip) {
799 // MAC address of the gateway
800 memcpy(&l3->gw.mac, &hdr_arp->arp.data.sha, 6);
801 l3->flags |= FLAG_DST_MAC_KNOWN;
802 l3->gw.reachable_timeout = tsc + reachable_timeout;
803 update_arp_ndp_retransmit_timeout(l3, &l3->gw.arp_ndp_retransmit_timeout, l3->arp_ndp_retransmit_timeout);
804 } else if (l3->n_pkts < 4) {
805 // Few packets tracked - should be faster to loop through them than using a hash table
806 for (idx = 0; idx < l3->n_pkts; idx++) {
807 ip_dst = l3->optimized_arp_table[idx].ip;
// idx < n_pkts here means the loop stopped early, i.e. a matching entry was found
811 if (idx < l3->n_pkts) {
812 memcpy(&l3->optimized_arp_table[idx].mac, &(hdr_arp->arp.data.sha), sizeof(prox_rte_ether_addr));
813 l3->optimized_arp_table[idx].reachable_timeout = tsc + reachable_timeout;
814 update_arp_ndp_retransmit_timeout(l3, &l3->optimized_arp_table[idx].arp_ndp_retransmit_timeout, l3->arp_ndp_retransmit_timeout);
817 ret = rte_hash_add_key(l3->ip_hash, (const void *)&ip);
819 plogx_info("Unable add ip "IPv4_BYTES_FMT" in mac_hash\n", IP4(ip));
821 memcpy(&l3->arp_table[ret].mac, &(hdr_arp->arp.data.sha), sizeof(prox_rte_ether_addr));
822 l3->arp_table[ret].reachable_timeout = tsc + reachable_timeout;
823 update_arp_ndp_retransmit_timeout(l3, &l3->arp_table[ret].arp_ndp_retransmit_timeout, l3->arp_ndp_retransmit_timeout);
828 case MAC_INFO_FROM_MASTER_FOR_IPV6:
829 ip6 = ctrl_ring_get_ipv6_addr(mbufs[j]);
// The MAC is taken from the leading sizeof(prox_rte_ether_addr) bytes of this 64-bit value
830 uint64_t data = ctrl_ring_get_data(mbufs[j]);
832 if (l3->n_pkts < 4) {
833 // Few packets tracked - should be faster to loop through them than using a hash table
834 for (idx = 0; idx < l3->n_pkts; idx++) {
835 ip6_dst = &l3->optimized_arp_table[idx].ip6;
836 if (memcmp(ip6_dst, ip6, sizeof(struct ipv6_addr)) == 0)
839 if (idx < l3->n_pkts) {
840 // IP found; this is a reply for one of our requests!
841 memcpy(&l3->optimized_arp_table[idx].mac, &data, sizeof(prox_rte_ether_addr));
842 l3->optimized_arp_table[idx].reachable_timeout = tsc + l3->reachable_timeout * hz / 1000;
845 int ret = rte_hash_add_key(l3->ip6_hash, (const void *)ip6);
847 plogx_info("Unable add ip "IPv6_BYTES_FMT" in mac_hash\n", IPv6_BYTES(ip6->bytes));
849 memcpy(&l3->arp_table[ret].mac, &data, sizeof(prox_rte_ether_addr));
850 l3->arp_table[ret].reachable_timeout = tsc + l3->reachable_timeout * hz / 1000;
855 case SEND_NDP_FROM_MASTER:
856 case SEND_ARP_REQUEST_FROM_MASTER:
857 case SEND_ARP_REPLY_FROM_MASTER:
859 // tx_ctrlplane_pkt does not drop packets
860 plogx_dbg("\tForwarding (ARP) packet from master\n");
861 tbase->aux->tx_ctrlplane_pkt(tbase, &mbufs[j], 1, out);
862 TASK_STATS_ADD_TX_NON_DP(&tbase->aux->stats, 1);
864 case SEND_ICMP_FROM_MASTER:
866 // tx_ctrlplane_pkt does not drop packets
867 plogx_dbg("\tForwarding (PING) packet from master\n");
868 tbase->aux->tx_ctrlplane_pkt(tbase, &mbufs[j], 1, out);
869 TASK_STATS_ADD_TX_NON_DP(&tbase->aux->stats, 1);
872 // Drop Pseudo packets sent to generate ARP requests
873 // There are other IPv4 packets sent from TAP which we cannot delete e.g. BGP packets
875 hdr = rte_pktmbuf_mtod(mbufs[j], prox_rte_ether_hdr *);
// Locate the IPv4 header, either directly after Ethernet or after a single VLAN tag
876 if (hdr->ether_type == ETYPE_IPv4) {
877 pip = (prox_rte_ipv4_hdr *)(hdr + 1);
878 } else if (hdr->ether_type == ETYPE_VLAN) {
879 prox_rte_vlan_hdr *vlan = (prox_rte_vlan_hdr *)(hdr + 1);
880 vlan = (prox_rte_vlan_hdr *)(hdr + 1);
881 if (vlan->eth_proto == ETYPE_IPv4) {
882 pip = (prox_rte_ipv4_hdr *)(vlan + 1);
// The UDP header is read right after the fixed-size IPv4 header (pip + 1)
885 if (pip && (pip->next_proto_id == IPPROTO_UDP)) {
886 udp_hdr = (prox_rte_udp_hdr *)(pip + 1);
887 if ((udp_hdr->dst_port == rte_cpu_to_be_16(PROX_PSEUDO_PKT_PORT)) &&
888 (udp_hdr->src_port == rte_cpu_to_be_16(PROX_PSEUDO_PKT_PORT)) &&
889 (rte_be_to_cpu_16(udp_hdr->dgram_len) == 8)) {
890 plogx_dbg("Dropping PROX packet\n");
896 uint16_t src_port = 0, dst_port = 0, len = 0;
898 src_port = udp_hdr->src_port;
899 dst_port = udp_hdr->dst_port;
900 len = rte_be_to_cpu_16(udp_hdr->dgram_len);
902 plogx_dbg("tForwarding TAP packet from master. Type = %x, pip=%p, udp = %p, udp = {src = %x, dst = %x, len = %d}\n", hdr->ether_type, pip, udp_hdr, src_port, dst_port,len );
904 // tx_ctrlplane_pkt does not drop packets
905 tbase->aux->tx_ctrlplane_pkt(tbase, &mbufs[j], 1, out);
906 TASK_STATS_ADD_TX_NON_DP(&tbase->aux->stats, 1);
908 case IPV6_INFO_FROM_MASTER:
909 // addr = ctrl_ring_get_data(mbufs[j]);
910 ip6 = ctrl_ring_get_ipv6_addr(mbufs[j]);
// No global address yet: adopt the one received from the master
911 if (memcmp(&l3->global_ipv6 , &null_addr, 16) == 0) {
912 memcpy(&l3->global_ipv6, ip6, sizeof(struct ipv6_addr));
913 plog_info("Core %d task %d received global IP "IPv6_BYTES_FMT"\n", l3->core_id, l3->task_id, IPv6_BYTES(ip6->bytes));
// Otherwise only the first 8 bytes (the prefix) are compared with the known global address
914 } else if (memcmp(&l3->global_ipv6, ip6, 8) == 0) {
915 if (l3->prefix_printed == 0) {
916 plog_info("Core %d task %d received expected prefix "IPv6_PREFIX_FMT"\n", l3->core_id, l3->task_id, IPv6_PREFIX(ip6->bytes));
917 l3->prefix_printed = 1;
920 plog_warn("Core %d task %d received unexpected prefix "IPv6_PREFIX_FMT", IP = "IPv6_PREFIX_FMT"\n", l3->core_id, l3->task_id, IPv6_PREFIX(ip6->bytes), IPv6_PREFIX(l3->global_ipv6.bytes));
925 plog_err("Unexpected message received: %d\n", command);