2 // Copyright (c) 2010-2020 Intel Corporation
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
8 // http://www.apache.org/licenses/LICENSE-2.0
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
17 #include <rte_lcore.h>
19 #include <rte_hash_crc.h>
22 #include "task_base.h"
27 #include "handle_master.h"
28 #include "prox_port_cfg.h"
29 #include "packet_utils.h"
30 #include "prox_shared.h"
32 #include "hash_entry_types.h"
33 #include "prox_compat.h"
36 #include "prox_ipv6.h"
// Locate the IPv4 destination address of a frame, stepping over any stacked VLAN tags.
//   pkt    : start of the Ethernet frame.
//   len    : packet length in bytes, used to bound every header access shown below.
//   ip_dst : receives the dst_addr field of the IPv4 header when a full header fits in len.
//   vlan   : overwritten for every VLAN tag walked, so it ends up holding the last (innermost) one.
// Callers in this file treat a non-zero return value as "no IP address found".
// NOTE(review): the l2_len updates, the ether_type dispatch and the return statements of this
// function are not visible here; this description only covers the lines shown.
39 static inline int find_ip(struct ether_hdr_arp *pkt, uint16_t len, uint32_t *ip_dst, uint16_t *vlan)
41 prox_rte_vlan_hdr *vlan_hdr;
42 prox_rte_ether_hdr *eth_hdr = (prox_rte_ether_hdr*)pkt;
43 prox_rte_ipv4_hdr *ip;
44 uint16_t ether_type = eth_hdr->ether_type;
45 uint16_t l2_len = sizeof(prox_rte_ether_hdr);
// Walk 802.1Q / 802.1ad tags for as long as another complete VLAN header still fits in the packet.
49 while (((ether_type == ETYPE_VLAN) || (ether_type == ETYPE_8021ad)) && (l2_len + sizeof(prox_rte_vlan_hdr) < len)) {
50 vlan_hdr = (prox_rte_vlan_hdr *)((uint8_t *)pkt + l2_len);
52 ether_type = vlan_hdr->eth_proto;
// The mask is applied to the TCI before the byte swap; on a little-endian host 0xFF0F in
// network order equals 0x0FFF after conversion, i.e. only the 12-bit VLAN ID is kept.
53 *vlan = rte_be_to_cpu_16(vlan_hdr->vlan_tci & 0xFF0F); // Store VLAN, or CVLAN if QinQ
59 // In case of MPLS, next hop MAC is based on MPLS, not destination IP
71 plog_warn("Unsupported packet type %x - CRC might be wrong\n", ether_type);
// Read the destination only when l2_len is non-zero and a whole IPv4 header lies within len.
// NOTE(review): presumably l2_len is zeroed on a path not visible here to mark "no IP header".
75 if (l2_len && (l2_len + sizeof(prox_rte_ipv4_hdr) <= len)) {
76 prox_rte_ipv4_hdr *ip = (prox_rte_ipv4_hdr *)((uint8_t *)pkt + l2_len);
77 // TODO: implement LPM => replace ip_dst by next hop IP DST
78 *ip_dst = ip->dst_addr;
// Extract the VLAN tag of a frame without looking at the L3 header.
//   pkt  : start of the Ethernet frame.
//   len  : packet length in bytes; a tag is only read if a full VLAN header fits before len.
//   vlan : overwritten for every tag walked, so it ends up holding the last (innermost) one.
//          It is not written by the lines shown when the frame carries no VLAN tag.
// NOTE(review): the statement advancing l2_len inside the loop is not visible here.
84 static inline void find_vlan(struct ether_hdr_arp *pkt, uint16_t len, uint16_t *vlan)
86 prox_rte_vlan_hdr *vlan_hdr;
87 prox_rte_ether_hdr *eth_hdr = (prox_rte_ether_hdr*)pkt;
88 uint16_t ether_type = eth_hdr->ether_type;
89 uint16_t l2_len = sizeof(prox_rte_ether_hdr);
// Same tag walk as find_ip(): continue while the ether type announces another VLAN header.
93 while (((ether_type == ETYPE_8021ad) || (ether_type == ETYPE_VLAN)) && (l2_len + sizeof(prox_rte_vlan_hdr) < len)) {
94 vlan_hdr = (prox_rte_vlan_hdr *)((uint8_t *)pkt + l2_len);
96 ether_type = vlan_hdr->eth_proto;
// Mask applied before the byte swap: on a little-endian host this keeps the 12-bit VLAN ID.
97 *vlan = rte_be_to_cpu_16(vlan_hdr->vlan_tci & 0xFF0F); // Store VLAN, or CVLAN if QinQ
// Locate the IPv6 header of a frame, handling up to two VLAN tags.
//   pkt    : start of the Ethernet frame.
//   len    : packet length in bytes.
//   ip_dst : receives a copy of the IPv6 destination address when an IPv6 header is found.
//   vlan   : receives the VLAN ID of the last tag parsed (outer, then inner if present).
// Returns a pointer to the source address field inside the packet when the ether type is IPv6
// and a full IPv6 header fits in len. The caller in this file compares the result with NULL to
// detect non-IPv6 packets.
// NOTE(review): the l2_len updates after each tag and the fall-through return are not visible
// here. Unlike find_ip(), the visible lines read the VLAN header(s) without checking them
// against len.
101 static inline struct ipv6_addr *find_ip6(prox_rte_ether_hdr *pkt, uint16_t len, struct ipv6_addr *ip_dst, uint16_t *vlan)
103 uint16_t ether_type = pkt->ether_type;
104 uint16_t l2_len = sizeof(prox_rte_ether_hdr);
// Outer tag: either a plain 802.1Q tag or the 802.1ad service tag of a QinQ frame.
107 if ((ether_type == ETYPE_VLAN) || (ether_type == ETYPE_8021ad)) {
108 prox_rte_vlan_hdr *vlan_hdr = (prox_rte_vlan_hdr *)((uint8_t *)pkt + l2_len);
109 ether_type = vlan_hdr->eth_proto;
111 *vlan = rte_be_to_cpu_16(vlan_hdr->vlan_tci & 0xFF0F);
// Inner tag: it immediately follows the outer one, and its ID replaces the one stored above.
112 if (ether_type == ETYPE_VLAN) {
113 vlan_hdr = (prox_rte_vlan_hdr *)(vlan_hdr + 1);
114 ether_type = vlan_hdr->eth_proto;
116 *vlan = rte_be_to_cpu_16(vlan_hdr->vlan_tci & 0xFF0F);
119 if ((ether_type == ETYPE_IPv6) && (l2_len + sizeof(prox_rte_ipv6_hdr) <= len)) {
120 prox_rte_ipv6_hdr *ip = (prox_rte_ipv6_hdr *)((uint8_t *)pkt + l2_len);
121 // TODO: implement LPM => replace ip_dst by next hop IP DST
122 memcpy(ip_dst, &ip->dst_addr, sizeof(struct ipv6_addr));
// Hand back the in-packet source address so that the caller can overwrite it in place.
123 return (struct ipv6_addr *)&ip->src_addr;
// Send an unsolicited IPv6 Neighbour Advertisement for each configured address of the task.
// For the local address and then the global address, when the address is non-zero:
//   - take an mbuf from the task's ARP/ND mempool,
//   - build the advertisement with the MAC and first VLAN tag of the reachable port,
//   - transmit it through the task's control-plane TX function and count it as non-dataplane TX.
// A failed mempool get is reported with plog_err.
// NOTE(review): the declaration of ret, the else keywords and the condition guarding the final
// "No neighbor advertisement sent" message are not visible here.
128 void send_unsollicited_neighbour_advertisement(struct task_base *tbase)
131 uint8_t out = 0, port_id = tbase->l3.reachable_port_id;
132 struct rte_mbuf *mbuf = NULL;
// The 16-byte address is read as one 128-bit integer to test it against the all-zero address.
134 if (*(__int128 *)(&tbase->l3.local_ipv6) != 0) {
135 ret = rte_mempool_get(tbase->l3.arp_nd_pool, (void **)&mbuf);
136 if (likely(ret == 0)) {
137 mbuf->port = port_id;
138 build_neighbour_advertisement(tbase->l3.tmaster, mbuf, &prox_port_cfg[port_id].eth_addr, &tbase->l3.local_ipv6, PROX_UNSOLLICITED, prox_port_cfg[port_id].vlan_tags[0]);
139 tbase->aux->tx_ctrlplane_pkt(tbase, &mbuf, 1, &out);
140 TASK_STATS_ADD_TX_NON_DP(&tbase->aux->stats, 1);
142 plog_err("Failed to get a mbuf from arp/ndp mempool\n");
// Same sequence for the global address.
146 if (*(__int128 *)(&tbase->l3.global_ipv6) != 0) {
147 ret = rte_mempool_get(tbase->l3.arp_nd_pool, (void **)&mbuf);
148 if (likely(ret == 0)) {
149 mbuf->port = port_id;
150 build_neighbour_advertisement(tbase->l3.tmaster, mbuf, &prox_port_cfg[port_id].eth_addr, &tbase->l3.global_ipv6, PROX_UNSOLLICITED, prox_port_cfg[port_id].vlan_tags[0]);
151 tbase->aux->tx_ctrlplane_pkt(tbase, &mbuf, 1, &out);
152 TASK_STATS_ADD_TX_NON_DP(&tbase->aux->stats, 1);
154 plog_err("Failed to get a mbuf from arp/ndp mempool\n");
159 plog_err("No neighbor advertisement sent as no local or global ipv6\n");
// Send one IPv6 Router Solicitation on the task's reachable port.
//   tbase : task whose ARP/ND mempool and control-plane TX function are used.
//   targ  : task arguments; targ->local_ipv6 is passed to the packet builder.
// The packet is built with the MAC and first VLAN tag of the reachable port, transmitted through
// tx_ctrlplane_pkt and counted as non-dataplane TX. A failed mempool get is reported with plog_err.
// NOTE(review): the declaration of ret and the else keyword are not visible here.
163 static void send_router_sollicitation(struct task_base *tbase, struct task_args *targ)
166 uint8_t out = 0, port_id = tbase->l3.reachable_port_id;
167 struct rte_mbuf *mbuf;
169 ret = rte_mempool_get(tbase->l3.arp_nd_pool, (void **)&mbuf);
170 if (likely(ret == 0)) {
171 mbuf->port = port_id;
172 build_router_sollicitation(mbuf, &prox_port_cfg[port_id].eth_addr, &targ->local_ipv6, prox_port_cfg[port_id].vlan_tags[0]);
173 tbase->aux->tx_ctrlplane_pkt(tbase, &mbuf, 1, &out);
174 TASK_STATS_ADD_TX_NON_DP(&tbase->aux->stats, 1);
176 plog_err("Failed to get a mbuf from arp/ndp mempool\n");
180 /* This implementation could be improved: instead of checking each time we send a packet whether we also need
181 to send an ARP, we should only check whether the MAC is valid.
182 We should check arp_ndp_retransmit_timeout in the master process. This would also require the generating task to clear its arp ring
183 to avoid sending many ARPs while starting after a long stop.
184 We could also check for reachable_timeout in the master so that the dataplane only has to check whether the MAC is available,
185 but this would require either thread safety, or the exchange of information between master and generating core. */
// Register a new IPv4 destination in the ARP hash so that an ARP request can be sent for it.
//   ip_hash : hash keyed by the IPv4 address that ip_dst points to.
//   ip_dst  : destination address to insert.
//   entries : table indexed by the position returned by rte_hash_add_key().
//   nh      : next-hop index stored in the new entry.
//   time    : on success, set to point at the new entry's arp_ndp_retransmit_timeout field.
// tsc, hz and arp_ndp_retransmit_timeout are not used on the lines visible here.
// NOTE(review): the return statements of both branches are not visible here.
188 static inline int add_key_and_send_arp(struct rte_hash *ip_hash, uint32_t *ip_dst, struct arp_table *entries, uint64_t tsc, uint64_t hz, uint32_t arp_ndp_retransmit_timeout, prox_next_hop_index_type nh, uint64_t **time)
190 int ret = rte_hash_add_key(ip_hash, (const void *)ip_dst);
// A negative value means the key could not be added; there is then no table slot for the reply.
191 if (unlikely(ret < 0)) {
192 // No reason to send ARP, as reply would be anyhow ignored
193 plogx_err("Unable to add ip "IPv4_BYTES_FMT" in mac_hash\n", IP4(*ip_dst));
// ret is the key's position in the hash and is used directly as the index into entries.
196 entries[ret].ip = *ip_dst;
197 entries[ret].nh = nh;
198 *time = &entries[ret].arp_ndp_retransmit_timeout;
// Decide how to handle a packet whose ARP/ND table entry is `entry`, given the current tsc.
//   - Both timeouts still in the future: copy entry->mac into *mac.
//   - Retransmit timeout elapsed: point *time at the entry's retransmit timeout; if the MAC is
//     still reachable, also copy it into *mac and return SEND_MBUF_AND_ARP_ND.
//   - Otherwise: see the trailing comment (MAC unknown and a request was sent recently).
// hz and arp_ndp_retransmit_timeout are not used on the lines visible here.
// NOTE(review): only the SEND_MBUF_AND_ARP_ND return is visible; the return values of the other
// branches are not shown. When tsc equals arp_ndp_retransmit_timeout exactly, neither of the
// first two conditions holds.
203 static inline int update_mac_and_send_mbuf(struct arp_table *entry, prox_rte_ether_addr *mac, uint64_t tsc, uint64_t hz, uint32_t arp_ndp_retransmit_timeout, uint64_t **time)
205 if (likely((tsc < entry->arp_ndp_retransmit_timeout) && (tsc < entry->reachable_timeout))) {
206 memcpy(mac, &entry->mac, sizeof(prox_rte_ether_addr));
208 } else if (tsc > entry->arp_ndp_retransmit_timeout) {
209 // long time since we have sent an arp, send arp
210 *time = &entry->arp_ndp_retransmit_timeout;
211 if (tsc < entry->reachable_timeout){
212 // MAC is valid in the table => send also the mbuf
213 memcpy(mac, &entry->mac, sizeof(prox_rte_ether_addr));
214 return SEND_MBUF_AND_ARP_ND;
216 // MAC still unknown, or timed out => only send ARP
220 // MAC is unknown and we already sent an ARP recently, drop mbuf and wait for ARP reply
// Fill in the destination MAC of an outgoing IPv4 packet and tell the caller whether the packet
// and/or an ARP request must be sent.
//   tbase  : task owning the L3 state (tbase->l3).
//   mbuf   : packet to transmit; its Ethernet destination address is written in place.
//   ip_dst : receives the IPv4 destination found in the packet.
//   vlan   : receives the VLAN ID found in the packet, if any.
//   time   : set to point at the retransmit-timeout field of the entry an ARP request is needed for.
//   tsc    : current timestamp counter value.
// Three strategies appear below, in this order:
//   1. a routing table (LPM) lookup that yields a next-hop entry or a local route,
//   2. a single gateway whose MAC is used for all packets,
//   3. a per-destination ARP table: linear search while fewer than 4 destinations are known,
//      hash table afterwards.
// NOTE(review): the conditions selecting between these strategies, the else keywords and most
// return statements are not visible here; only SEND_MBUF_AND_ARP_ND and the values returned by
// the helpers are shown.
224 int write_dst_mac(struct task_base *tbase, struct rte_mbuf *mbuf, uint32_t *ip_dst, uint16_t *vlan, uint64_t **time, uint64_t tsc)
226 const uint64_t hz = rte_get_tsc_hz();
227 struct ether_hdr_arp *packet = rte_pktmbuf_mtod(mbuf, struct ether_hdr_arp *);
228 prox_rte_ether_addr *mac = &packet->ether_hdr.d_addr;
229 prox_next_hop_index_type next_hop_index;
// Function-level statics: the "no route" rate limiter below is shared by every caller of this function.
230 static uint64_t last_tsc = 0, n_no_route = 0;
232 struct l3_base *l3 = &(tbase->l3);
234 // First find the next hop
236 // A routing table was configured
237 // If a gw (gateway_ipv4) is also specified, it is used as default gw only i.e. lowest priority (shortest prefix)
238 // This is implemented automatically through lpm
239 uint16_t len = rte_pktmbuf_pkt_len(mbuf);
240 if (find_ip(packet, len, ip_dst, vlan) != 0) {
241 // Unable to find IP address => non IP packet => send it as is
// The address is byte-swapped before the LPM lookup; a non-zero result means no matching route.
244 if (unlikely(rte_lpm_lookup(l3->ipv4_lpm, rte_bswap32(*ip_dst), &next_hop_index) != 0)) {
245 // Prevent printing too many messages
// Log only when more than one second's worth of tsc ticks has passed since last_tsc.
247 if (tsc > last_tsc + rte_get_tsc_hz()) {
248 plogx_err("No route to IP "IPv4_BYTES_FMT" (%ld times)\n", IP4(*ip_dst), n_no_route);
254 struct arp_table *entry = &l3->next_hops[next_hop_index];
258 return update_mac_and_send_mbuf(entry, mac, tsc, hz, l3->arp_ndp_retransmit_timeout, time);
261 // no next ip: this is a local route
262 // Find IP in lookup table. Send ARP if not found
263 int ret = rte_hash_lookup(l3->ip_hash, (const void *)ip_dst);
264 if (unlikely(ret < 0)) {
265 // IP not found, try to send an ARP
266 return add_key_and_send_arp(l3->ip_hash, ip_dst, l3->arp_table, tsc, hz, l3->arp_ndp_retransmit_timeout, MAX_HOP_INDEX, time);
268 return update_mac_and_send_mbuf(&l3->arp_table[ret], mac, tsc, hz, l3->arp_ndp_retransmit_timeout, time);
272 // No Routing table specified: only a local ip and maybe a gateway
273 // Old default behavior: if a gw is specified, ALL packets go to this gateway (even those we could send w/o the gw)
275 uint16_t len = rte_pktmbuf_pkt_len(mbuf);
277 find_vlan(packet, len, vlan);
// Gateway MAC known and neither timeout elapsed: use the cached gateway MAC.
278 if (likely((l3->flags & FLAG_DST_MAC_KNOWN) && (tsc < l3->gw.arp_ndp_retransmit_timeout) && (tsc < l3->gw.reachable_timeout))) {
279 memcpy(mac, &l3->gw.mac, sizeof(prox_rte_ether_addr));
281 } else if (tsc > l3->gw.arp_ndp_retransmit_timeout) {
282 // long time since we have successfully sent an arp, send arp
283 // If sending ARP failed (ring full) then arp_ndp_retransmit_timeout is not updated to avoid having to wait 1 sec to send ARP REQ again
284 *time = &l3->gw.arp_ndp_retransmit_timeout;
// Next retransmit deadline: the configured timeout is scaled by hz / 1000, i.e. presumably expressed in milliseconds.
285 l3->gw.arp_ndp_retransmit_timeout = tsc + l3->arp_ndp_retransmit_timeout * hz / 1000;
288 if ((l3->flags & FLAG_DST_MAC_KNOWN) && (tsc < l3->gw.reachable_timeout)){
289 // MAC is valid in the table => send also the mbuf
290 memcpy(mac, &l3->gw.mac, sizeof(prox_rte_ether_addr));
291 return SEND_MBUF_AND_ARP_ND;
293 // MAC still unknown, or timed out => only send ARP
297 // MAC is unknown and we already sent an ARP recently, drop mbuf and wait for ARP reply
302 if (find_ip(packet, len, ip_dst, vlan) != 0) {
303 // Unable to find IP address => non IP packet => send it as is
// Fast path: with fewer than 4 known destinations a linear scan of optimized_arp_table is used.
306 if (likely(l3->n_pkts < 4)) {
307 for (unsigned int idx = 0; idx < l3->n_pkts; idx++) {
308 if (*ip_dst == l3->optimized_arp_table[idx].ip) {
309 return update_mac_and_send_mbuf(&l3->optimized_arp_table[idx], mac, tsc, hz, l3->arp_ndp_retransmit_timeout, time);
312 // IP address not found in table
313 l3->optimized_arp_table[l3->n_pkts].ip = *ip_dst;
314 *time = &l3->optimized_arp_table[l3->n_pkts].arp_ndp_retransmit_timeout;
317 if (l3->n_pkts < 4) {
321 // We have too many IP addresses to search linearly; let's use hash table instead => copy all entries in hash table
322 for (uint32_t idx = 0; idx < l3->n_pkts; idx++) {
323 uint32_t ip = l3->optimized_arp_table[idx].ip;
324 int ret = rte_hash_add_key(l3->ip_hash, (const void *)&ip);
326 // This should not happen as few entries so far.
327 // If it happens, we still send the ARP as easier:
328 // If the ARP corresponds to this error, the ARP reply will be ignored
329 // If ARP does not correspond to this error/ip, then ARP reply will be handled.
330 plogx_err("Unable add ip "IPv4_BYTES_FMT" in mac_hash (already %d entries)\n", IP4(ip), idx);
// Migrate the whole entry (MAC and timeouts included) to the slot matching its hash position.
332 memcpy(&l3->arp_table[ret], &l3->optimized_arp_table[idx], sizeof(struct arp_table));
337 // Find IP in lookup table. Send ARP if not found
338 int ret = rte_hash_lookup(l3->ip_hash, (const void *)ip_dst);
339 if (unlikely(ret < 0)) {
340 // IP not found, try to send an ARP
// NOTE(review): ret is negative on this path, so &l3->arp_table[ret] points before the table.
// add_key_and_send_arp() indexes its `entries` argument with the position returned by
// rte_hash_add_key(), and the equivalent call in the routing-table path passes l3->arp_table.
// This looks like it should pass l3->arp_table as well - confirm and fix.
341 return add_key_and_send_arp(l3->ip_hash, ip_dst, &l3->arp_table[ret], tsc, hz, l3->arp_ndp_retransmit_timeout, MAX_HOP_INDEX, time);
344 return update_mac_and_send_mbuf(&l3->arp_table[ret], mac, tsc, hz, l3->arp_ndp_retransmit_timeout, time);
// IPv6 counterpart of write_dst_mac(): choose the source address, rewrite it in the packet, fill
// in the destination MAC and tell the caller whether the packet and/or an NDP request must be sent.
//   tbase  : task owning the L3 state (tbase->l3).
//   mbuf   : packet to transmit; source IPv6 address and destination MAC are written in place.
//   ip_dst : receives the IPv6 destination of the packet, or the gateway address when the
//            gateway branch below is taken.
//   vlan   : receives the VLAN ID found in the packet, if any.
//   tsc    : current timestamp counter value.
// Unlike the IPv4 variant, the retransmit timeouts are updated directly in this function.
// NOTE(review): most return statements and else keywords are not visible here; only
// SEND_MBUF_AND_ARP_ND is shown. Addresses are compared through uint64_t / __int128 pointer
// casts, which assumes that struct ipv6_addr is 16 bytes and suitably aligned - confirm.
351 int write_ip6_dst_mac(struct task_base *tbase, struct rte_mbuf *mbuf, struct ipv6_addr *ip_dst, uint16_t *vlan, uint64_t tsc)
353 const uint64_t hz = rte_get_tsc_hz();
354 prox_rte_ether_hdr *packet = rte_pktmbuf_mtod(mbuf, prox_rte_ether_hdr *);
355 prox_rte_ether_addr *mac = &packet->d_addr;
356 struct ipv6_addr *used_ip_src;
358 uint16_t len = rte_pktmbuf_pkt_len(mbuf);
360 struct ipv6_addr *pkt_src_ip6;
361 if ((pkt_src_ip6 = find_ip6(packet, len, ip_dst, vlan)) == NULL) {
362 // Unable to find IP address => non IP packet => send it as is
365 struct l3_base *l3 = &(tbase->l3);
367 // Configure source IP
// Only the first 8 bytes (the /64 prefix) of the addresses are compared here.
368 if (*(uint64_t *)(&l3->local_ipv6) == *(uint64_t *)ip_dst) {
369 // Same prefix as local -> use local
370 used_ip_src = &l3->local_ipv6;
371 } else if (*(uint64_t *)(&l3->global_ipv6) == *(uint64_t *)ip_dst) {
372 // Same prefix as global -> use global
373 used_ip_src = &l3->global_ipv6;
374 } else if (*(__int128 *)(&l3->gw.ip6) != 0) {
// A gateway is configured: send from the global address and resolve the gateway instead of the destination.
375 used_ip_src = &l3->global_ipv6;
376 memcpy(ip_dst, &l3->gw.ip6, sizeof(struct ipv6_addr));
377 } else if (*(__int128 *)(&l3->global_ipv6) != 0) {
378 // Global IP is defined -> use it
379 used_ip_src = &l3->global_ipv6;
381 plog_info("Error as trying to send a packet to "IPv6_BYTES_FMT" using "IPv6_BYTES_FMT" (local)\n", IPv6_BYTES(ip_dst->bytes), IPv6_BYTES(l3->local_ipv6.bytes));
// Overwrite the packet's source address in place with the address selected above.
384 rte_memcpy(pkt_src_ip6, used_ip_src, sizeof(struct ipv6_addr));
// Fast path: with fewer than 4 known destinations a linear scan of optimized_arp_table is used.
387 if (likely(l3->n_pkts < 4)) {
388 for (unsigned int idx = 0; idx < l3->n_pkts; idx++) {
389 if (*(__int128 *)ip_dst == *(__int128 *)(&l3->optimized_arp_table[idx].ip6)) {
390 // IP address already in table
391 if ((tsc < l3->optimized_arp_table[idx].arp_ndp_retransmit_timeout) && (tsc < l3->optimized_arp_table[idx].reachable_timeout)) {
392 // MAC address was recently updated in table, use it
393 // plog_dbg("Valid MAC address found => send packet\n");
394 rte_memcpy(mac, &l3->optimized_arp_table[idx].mac, sizeof(prox_rte_ether_addr));
396 } else if (tsc > l3->optimized_arp_table[idx].arp_ndp_retransmit_timeout) {
397 // NDP not sent since a long time, send NDP
398 l3->optimized_arp_table[idx].arp_ndp_retransmit_timeout = tsc + l3->arp_ndp_retransmit_timeout * hz / 1000;
399 if (tsc < l3->optimized_arp_table[idx].reachable_timeout) {
400 // MAC still valid => also send mbuf
401 plog_dbg("Valid MAC found but NDP retransmit timeout => send packet and NDP\n");
402 memcpy(mac, &l3->optimized_arp_table[idx].mac, sizeof(prox_rte_ether_addr));
403 return SEND_MBUF_AND_ARP_ND;
405 plog_dbg("Unknown MAC => send NDP but cannot send packet\n");
406 // MAC invalid => only send NDP
410 // NDP timeout elapsed, MAC not valid anymore but waiting for NDP reply
411 // plog_dbg("NDP reachable timeout elapsed - waiting for NDP reply\n");
416 // IP address not found in table
417 memcpy(&l3->optimized_arp_table[l3->n_pkts].ip6, ip_dst, sizeof(struct ipv6_addr));
418 l3->optimized_arp_table[l3->n_pkts].arp_ndp_retransmit_timeout = tsc + l3->arp_ndp_retransmit_timeout * hz / 1000;
421 if (l3->n_pkts < 4) {
425 // We have too many IP addresses to search linearly; let's use hash table instead => copy all entries in hash table
426 for (uint32_t idx = 0; idx < l3->n_pkts; idx++) {
427 struct ipv6_addr *ip6 = &l3->optimized_arp_table[idx].ip6;
428 int ret = rte_hash_add_key(l3->ip6_hash, (const void *)ip6);
430 // This should not happen as few entries so far.
431 // If it happens, we still send the NDP as easier:
432 // If the NDP corresponds to this error, the NDP reply will be ignored
433 // If NDP does not correspond to this error/ip, then NDP reply will be handled.
434 plogx_err("Unable add ip "IPv6_BYTES_FMT" in mac_hash (already %d entries)\n", IPv6_BYTES(ip6->bytes), idx);
// Migrate the whole entry (MAC and timeouts included) to the slot matching its hash position.
436 memcpy(&l3->arp_table[ret], &l3->optimized_arp_table[idx], sizeof(struct arp_table));
441 // Find IP in lookup table. Send ND if not found
442 int ret = rte_hash_lookup(l3->ip6_hash, (const void *)ip_dst);
443 if (unlikely(ret < 0)) {
444 // IP not found, try to send an ND
// This inner ret shadows the lookup result above and holds the position of the newly added key.
445 int ret = rte_hash_add_key(l3->ip6_hash, (const void *)ip_dst);
447 // No reason to send NDP, as reply would be anyhow ignored
448 plogx_err("Unable to add ip "IPv6_BYTES_FMT" in mac_hash\n", IPv6_BYTES(ip_dst->bytes));
451 memcpy(&l3->arp_table[ret].ip6, ip_dst, sizeof(struct ipv6_addr));
452 l3->arp_table[ret].arp_ndp_retransmit_timeout = tsc + l3->arp_ndp_retransmit_timeout * hz / 1000;
457 if (likely((tsc < l3->arp_table[ret].arp_ndp_retransmit_timeout) && (tsc < l3->arp_table[ret].reachable_timeout))) {
458 // MAC still valid and NDP sent recently
459 memcpy(mac, &l3->arp_table[ret].mac, sizeof(prox_rte_ether_addr));
461 } else if (tsc > l3->arp_table[ret].arp_ndp_retransmit_timeout) {
462 // NDP not sent since a long time, send NDP
463 l3->arp_table[ret].arp_ndp_retransmit_timeout = tsc + l3->arp_ndp_retransmit_timeout * hz / 1000;
464 if (tsc < l3->arp_table[ret].reachable_timeout) {
465 // MAC still valid => send also MBUF
466 memcpy(mac, &l3->arp_table[ret].mac, sizeof(prox_rte_ether_addr));
467 return SEND_MBUF_AND_ARP_ND;
// Initialise the L3 (ARP/NDP) state of a task.
//   tbase : task whose tbase->l3 fields are filled in; TASK_L3 is set in tbase->flags.
//   targ  : task arguments providing the flags, gateway addresses and timeouts.
// Steps shown below:
//   - create the IPv4 hash (4-byte key) when TASK_ARG_L3 is set,
//   - create the IPv6 hash (sizeof(struct ipv6_addr) key) when TASK_ARG_NDP is set,
//   - allocate a zeroed arp/ndp table of MAX_ARP_ENTRIES * 4 entries on the task's socket,
//   - install handle_ctrl_plane_pkts as the task's control function,
//   - copy gateway addresses and timeouts, using the DEFAULT_* values when targ holds 0.
// Any creation or allocation failure stops the program through PROX_PANIC.
// NOTE(review): part of the hash_params initializer and the else keywords are not visible here.
480 void task_init_l3(struct task_base *tbase, struct task_args *targ)
// Static buffer: the formatted name outlives this call. With 3-digit ids the string needs 19 bytes including the terminator.
482 static char hash_name[30];
483 uint32_t n_entries = MAX_ARP_ENTRIES * 4;
484 const int socket_id = rte_lcore_to_socket_id(targ->lconf->id);
485 sprintf(hash_name, "A%03d_%03d_mac_table", targ->lconf->id, targ->id);
489 struct rte_hash_parameters hash_params = {
491 .entries = n_entries,
492 .key_len = sizeof(uint32_t),
493 .hash_func = rte_hash_crc,
494 .hash_func_init_val = 0,
496 if (targ->flags & TASK_ARG_L3) {
497 plog_info("\t\tInitializing L3 (IPv4)\n");
498 tbase->l3.ip_hash = rte_hash_create(&hash_params);
499 PROX_PANIC(tbase->l3.ip_hash == NULL, "Failed to set up ip hash table\n");
503 if (targ->flags & TASK_ARG_NDP) {
504 plog_info("\t\tInitializing NDP (IPv6)\n");
// Same parameters are reused for the IPv6 hash; only the key length changes on the lines shown.
505 hash_params.key_len = sizeof(struct ipv6_addr);
506 tbase->l3.ip6_hash = rte_hash_create(&hash_params);
507 PROX_PANIC(tbase->l3.ip6_hash == NULL, "Failed to set up ip hash table\n");
509 tbase->l3.arp_table = (struct arp_table *)prox_zmalloc(n_entries * sizeof(struct arp_table), socket_id);
510 PROX_PANIC(tbase->l3.arp_table == NULL, "Failed to allocate memory for %u entries in arp/ndp table\n", n_entries);
// NOTE(review): the size logged is sizeof(struct l3_base), whereas the table allocated above is
// made of struct arp_table entries - the message probably reports the wrong size.
511 plog_info("\t\tarp/ndp table, with %d entries of size %ld\n", n_entries, sizeof(struct l3_base));
513 targ->lconf->ctrl_func_p[targ->task] = handle_ctrl_plane_pkts;
514 targ->lconf->ctrl_timeout = freq_to_tsc(targ->ctrl_freq);
515 tbase->l3.gw.ip = rte_cpu_to_be_32(targ->gateway_ipv4);
516 memcpy(&tbase->l3.gw.ip6, &targ->gateway_ipv6, sizeof(struct ipv6_addr));
517 tbase->flags |= TASK_L3;
518 tbase->l3.core_id = targ->lconf->id;
519 tbase->l3.task_id = targ->id;
520 tbase->l3.tmaster = targ->tmaster;
521 tbase->l3.seed = (uint)rte_rdtsc();
// A configured value of 0 means "not set": fall back to the compile-time default.
522 if (targ->reachable_timeout != 0)
523 tbase->l3.reachable_timeout = targ->reachable_timeout;
525 tbase->l3.reachable_timeout = DEFAULT_ARP_TIMEOUT;
526 if (targ->arp_ndp_retransmit_timeout != 0)
527 tbase->l3.arp_ndp_retransmit_timeout = targ->arp_ndp_retransmit_timeout;
529 tbase->l3.arp_ndp_retransmit_timeout = DEFAULT_ARP_UPDATE_TIME;
// Finish the L3 setup of a task once its ports are known.
//   tbase : task whose L3 state is completed (reachable port, LPM, next hops, ARP/ND mempool).
//   targ  : task arguments (local addresses, route table name, gateway, NDP flags).
// The work below is done only when a reachable port is found and the ARP/ND mempool has not
// been created yet. It then:
//   - checks that the core-section local_ipv4 / prefix agree with the port section, and adopts
//     the core-section values when the port has no address,
//   - registers every port address with the control plane,
//   - when a route table is named, loads it into an LPM and fills the next-hop table, adding
//     the local networks and the default gateway as extra next hops,
//   - sets up the IPv6 addresses and registers the task as a router or node when NDP is enabled,
//   - creates the ARP/ND mempool and optionally sends a Router Solicitation and an unsolicited
//     Neighbour Advertisement.
// NOTE(review): the declarations of ret and lpm, several statements and the last argument of
// rte_mempool_create() are not visible here.
532 void task_start_l3(struct task_base *tbase, struct task_args *targ)
534 const int socket_id = rte_lcore_to_socket_id(targ->lconf->id);
535 const int NB_ARP_ND_MBUF = 1024;
536 const int ARP_ND_MBUF_SIZE = 2048;
537 const int NB_CACHE_ARP_ND_MBUF = 256;
539 struct prox_port_cfg *port = find_reachable_port(targ);
540 if (port && (tbase->l3.arp_nd_pool == NULL)) {
541 static char name[] = "arp0_pool";
// The port id is the index of the port within the global prox_port_cfg array.
542 tbase->l3.reachable_port_id = port - prox_port_cfg;
// Both sections define an address: they must agree, otherwise the configuration is rejected.
543 if ((targ->local_ipv4 && port->ip_addr[0].ip) && (targ->local_ipv4 != port->ip_addr[0].ip)) {
544 PROX_PANIC(1, "local_ipv4 in core section ("IPv4_BYTES_FMT") differs from port section ("IPv4_BYTES_FMT")\n", IP4(rte_be_to_cpu_32(targ->local_ipv4)), IP4(rte_be_to_cpu_32(port->ip_addr[0].ip)));
546 if ((targ->local_ipv4 && port->ip_addr[0].ip) && (targ->local_prefix != port->ip_addr[0].prefix)) {
547 PROX_PANIC(1, "local_ipv4 prefix in core section (%d) differs from port section (%d)\n", targ->local_prefix, port->ip_addr[0].prefix);
// The port has no address of its own: inherit the one from the core section.
549 if (!port->ip_addr[0].ip) {
550 port->ip_addr[0].ip = targ->local_ipv4;
551 port->ip_addr[0].prefix = targ->local_prefix;
553 port->vlan_tags[0] = 0;
554 plog_info("Setting port local_ipv4 from core %d local_ipv4 to "IPv4_BYTES_FMT"\n", tbase->l3.reachable_port_id, IP4(rte_be_to_cpu_32(port->ip_addr[0].ip)));
556 for (int vlan_id = 0; vlan_id < port->n_vlans; vlan_id++) {
557 register_ip_to_ctrl_plane(tbase->l3.tmaster, rte_be_to_cpu_32(port->ip_addr[vlan_id].ip), tbase->l3.reachable_port_id, targ->lconf->id, targ->id);
// A non-empty route table name enables the LPM-based routing mode.
559 if (strcmp(targ->route_table, "") != 0) {
563 PROX_PANIC(port->n_vlans == 0, "missing local_ipv4 while route table is specified in L3 mode\n");
565 // LPM might be modified runtime => do not share with other cores
566 ret = lua_to_lpm4(prox_lua(), GLOBAL, targ->route_table, socket_id, &lpm);
567 PROX_PANIC(ret, "Failed to load IPv4 LPM:\n%s\n", get_lua_to_errors());
569 tbase->l3.ipv4_lpm = lpm->rte_lpm;
570 tbase->l3.next_hops = prox_zmalloc(sizeof(*tbase->l3.next_hops) * MAX_HOP_INDEX, socket_id);
571 PROX_PANIC(tbase->l3.next_hops == NULL, "Could not allocate memory for next hop\n");
// Copy the gateways defined in the route table; a zero ip_dst marks an unused slot.
573 for (uint32_t i = 0; i < MAX_HOP_INDEX; i++) {
574 if (!lpm->next_hops[i].ip_dst)
577 tbase->l3.next_hops[i].ip = rte_bswap32(lpm->next_hops[i].ip_dst);
578 int tx_port = lpm->next_hops[i].mac_port.out_idx;
579 // gen only supports one port right now .... hence port = 0
580 if ((tx_port > targ->nb_txports - 1) && (tx_port > targ->nb_txrings - 1)) {
581 PROX_PANIC(1, "Routing Table contains port %d but only %d tx port/ %d ring:\n", tx_port, targ->nb_txports, targ->nb_txrings);
584 plog_info("Using routing table %s in l3 mode, with %d gateways\n", targ->route_table, tbase->l3.nb_gws);
586 // Last but one (x n_vlans) "next_hop_index" is not a gateway but direct routes
// A next hop with ip == 0 stands for "directly connected"; one is added per local network.
587 for (int vlan_id = 0; vlan_id < port->n_vlans; vlan_id++) {
588 tbase->l3.next_hops[tbase->l3.nb_gws].ip = 0;
589 ret = rte_lpm_add(tbase->l3.ipv4_lpm, port->ip_addr[vlan_id].ip, port->ip_addr[vlan_id].prefix, tbase->l3.nb_gws++);
590 PROX_PANIC(ret, "Failed to add local_ipv4 "IPv4_BYTES_FMT"/%d to lpm\n", IP4(port->ip_addr[vlan_id].ip), port->ip_addr[vlan_id].prefix);
593 // Last "next_hop_index" is default gw
594 tbase->l3.next_hops[tbase->l3.nb_gws].ip = rte_bswap32(targ->gateway_ipv4);
// Prefix length 0 makes the gateway the lowest-priority (default) route.
595 if (targ->gateway_ipv4) {
596 ret = rte_lpm_add(tbase->l3.ipv4_lpm, targ->gateway_ipv4, 0, tbase->l3.nb_gws++);
597 PROX_PANIC(ret, "Failed to add gateway_ipv4 "IPv4_BYTES_FMT"/%d to lpm\n", IP4(tbase->l3.gw.ip), 0);
601 master_init_vdev(tbase->l3.tmaster, tbase->l3.reachable_port_id, targ->lconf->id, targ->id);
603 // Create IPv6 addr if none were configured
604 if (targ->flags & TASK_ARG_NDP) {
// No local address configured: build one with set_link_local() and set_EUI() from the port MAC.
605 if (!memcmp(&targ->local_ipv6, &null_addr, sizeof(struct ipv6_addr))) {
606 set_link_local(&targ->local_ipv6);
607 set_EUI(&targ->local_ipv6, &port->eth_addr);
609 plog_info("\tCore %d, task %d, local IPv6 addr is "IPv6_BYTES_FMT" (%s)\n",
610 targ->lconf->id, targ->id,
611 IPv6_BYTES(targ->local_ipv6.bytes),
612 IP6_Canonical(&targ->local_ipv6));
613 memcpy(&tbase->l3.local_ipv6, &targ->local_ipv6, sizeof(struct ipv6_addr));
// The global address is only copied when one was configured (non-null).
615 if (memcmp(&targ->global_ipv6, &null_addr, sizeof(struct ipv6_addr))) {
616 memcpy(&tbase->l3.global_ipv6, &targ->global_ipv6, sizeof(struct ipv6_addr));
617 plog_info("\tCore %d, task %d, global IPv6 addr is "IPv6_BYTES_FMT" (%s)\n",
618 targ->lconf->id, targ->id,
619 IPv6_BYTES(targ->global_ipv6.bytes),
620 IP6_Canonical(&targ->global_ipv6));
622 if (targ->ipv6_router)
623 register_router_to_ctrl_plane(tbase->l3.tmaster, tbase->l3.reachable_port_id, targ->lconf->id, targ->id, &targ->local_ipv6, &targ->global_ipv6, &targ->router_prefix);
625 register_node_to_ctrl_plane(tbase->l3.tmaster, &targ->local_ipv6, &targ->global_ipv6, tbase->l3.reachable_port_id, targ->lconf->id, targ->id);
// Dedicated mempool for the ARP / ND packets generated by this task.
629 struct rte_mempool *ret = rte_mempool_create(name, NB_ARP_ND_MBUF, ARP_ND_MBUF_SIZE, NB_CACHE_ARP_ND_MBUF,
630 sizeof(struct rte_pktmbuf_pool_private), rte_pktmbuf_pool_init, NULL, rte_pktmbuf_init, 0,
632 PROX_PANIC(ret == NULL, "Failed to allocate ARP/ND memory pool on socket %u with %u elements\n",
633 rte_socket_id(), NB_ARP_ND_MBUF);
634 plog_info("\tMempool %p (%s) size = %u * %u cache %u, socket %d (for ARP/ND)\n", ret, name, NB_ARP_ND_MBUF,
635 ARP_ND_MBUF_SIZE, NB_CACHE_ARP_ND_MBUF, rte_socket_id());
636 tbase->l3.arp_nd_pool = ret;
// Only tasks that are not themselves IPv6 routers send a Router Solicitation.
637 if ((targ->flags & TASK_ARG_NDP) && (!targ->ipv6_router)) {
638 plog_info("Sending Router Sollicitation\n");
639 send_router_sollicitation(tbase, targ);
641 if ((targ->flags & TASK_ARG_NDP) && (targ->flags & TASK_ARG_SEND_NA_AT_STARTUP)) {
642 plog_info("Sending unsollicited Neighbour Advertisement\n");
643 send_unsollicited_neighbour_advertisement(tbase);
649 void task_set_gateway_ip(struct task_base *tbase, uint32_t ip)
651 tbase->l3.gw.ip = ip;
652 tbase->flags &= ~FLAG_DST_MAC_KNOWN;
// Force a new ARP request for an IPv4 address by zeroing the retransmit timeout of its entry.
//   l3 : L3 state of the task.
//   ip : IPv4 address whose entry is reset.
// The entry is looked up, in the order shown below, in the hash-indexed arp_table, in the gateway
// entry, in the small linear optimized_arp_table (fewer than 4 destinations) and finally in the
// hash table again. The reachable timeout is left untouched on the lines shown.
// NOTE(review): the declaration of idx, the conditions guarding the first and last hash lookups
// and the comparison inside the loop are not visible here.
655 static void reset_arp_ndp_retransmit_timeout(struct l3_base *l3, uint32_t ip)
658 plogx_dbg("MAC entry for IP "IPv4_BYTES_FMT" timeout in kernel\n", IP4(ip));
661 int ret = rte_hash_lookup(l3->ip_hash, (const void *)&ip);
663 l3->arp_table[ret].arp_ndp_retransmit_timeout = 0;
664 } else if (ip == l3->gw.ip) {
665 l3->gw.arp_ndp_retransmit_timeout = 0;
666 } else if (l3->n_pkts < 4) {
667 for (idx = 0; idx < l3->n_pkts; idx++) {
668 uint32_t ip_dst = l3->optimized_arp_table[idx].ip;
// idx below n_pkts after the loop means the scan stopped early, presumably on a matching entry.
672 if (idx < l3->n_pkts) {
673 l3->optimized_arp_table[idx].arp_ndp_retransmit_timeout = 0;
676 int ret = rte_hash_lookup(l3->ip_hash, (const void *)&ip);
678 l3->arp_table[ret].arp_ndp_retransmit_timeout = 0;
// Find, or create, the next-hop index of a gateway address.
//   tbase : task owning the next-hop table (tbase->l3.next_hops, tbase->l3.nb_gws entries in use).
//   gw_ip : gateway address, compared with the ip field of the existing next hops.
// Returns MAX_HOP_INDEX when the table is full. When a new entry is created at index nb_gws, the
// value returned is nb_gws - 1.
// NOTE(review): the return for an existing gateway and the increment of nb_gws are not visible
// here; nb_gws is presumably incremented between the store and the return.
683 static prox_next_hop_index_type get_nh_index(struct task_base *tbase, uint32_t gw_ip)
685 // Check if gateway already exists
686 for (prox_next_hop_index_type i = 0; i < tbase->l3.nb_gws; i++) {
687 if (tbase->l3.next_hops[i].ip == gw_ip) {
// Not found: append a new next hop if there is room left in the table.
691 if (tbase->l3.nb_gws < MAX_HOP_INDEX) {
692 tbase->l3.next_hops[tbase->l3.nb_gws].ip = gw_ip;
694 return tbase->l3.nb_gws - 1;
696 return MAX_HOP_INDEX;
// Process a burst of control-plane messages sent by the master task to this L3 task.
//   tbase  : task receiving the messages; its L3 tables are updated in place.
//   mbufs  : array of n_pkts messages; the command of each one is read with get_command().
//   n_pkts : number of messages in mbufs.
// Commands handled below:
//   ROUTE_ADD_FROM_MASTER / ROUTE_DEL_FROM_MASTER : add or delete an LPM route.
//   MAC_INFO_FROM_MASTER          : store an IPv4 MAC, or reset the retransmit timeout when the MAC is all zero.
//   MAC_INFO_FROM_MASTER_FOR_IPV6 : store an IPv6 neighbour MAC.
//   SEND_NDP / SEND_ARP_REQUEST / SEND_ARP_REPLY / SEND_ICMP _FROM_MASTER : forward the packet.
//   a TAP forwarding case (its label is not visible here) that drops PROX pseudo packets.
//   IPV6_INFO_FROM_MASTER         : learn or check the global IPv6 prefix.
// Anything else is logged as an unexpected message.
// NOTE(review): the switch statement, the break / else keywords, the declarations of out,
// command and nh, the bodies of the first prefetch loop and the mbuf release calls are not
// visible here.
698 void handle_ctrl_plane_pkts(struct task_base *tbase, struct rte_mbuf **mbufs, uint16_t n_pkts)
701 const uint64_t hz = rte_get_tsc_hz();
702 uint32_t ip, ip_dst, idx, gateway_ip, prefix;
703 prox_next_hop_index_type gateway_index;
704 int j, ret, modified_route;
706 struct ipv6_addr *ip6, *ip6_dst;
708 prox_rte_ether_hdr *hdr;
709 struct ether_hdr_arp *hdr_arp;
710 struct l3_base *l3 = &tbase->l3;
711 uint64_t tsc= rte_rdtsc();
// Reachable timeout converted to tsc ticks; the / 1000 suggests the configured value is in milliseconds.
712 uint64_t reachable_timeout = l3->reachable_timeout * hz / 1000;
714 prox_rte_ipv4_hdr *pip;
715 prox_rte_udp_hdr *udp_hdr;
716 uint8_t port = tbase->l3.reachable_port_id;
718 for (j = 0; j < n_pkts; ++j) {
// Prefetch the packet data of the whole burst before processing it.
721 for (j = 0; j < n_pkts; ++j) {
722 PREFETCH0(rte_pktmbuf_mtod(mbufs[j], void *));
725 for (j = 0; j < n_pkts; ++j) {
728 out[0] = OUT_HANDLED;
729 command = get_command(mbufs[j]);
730 plogx_dbg("\tReceived %s mbuf %p\n", actions_string[command], mbufs[j]);
732 case ROUTE_ADD_FROM_MASTER:
733 ip = ctrl_ring_get_ip(mbufs[j]);
734 gateway_ip = ctrl_ring_get_gateway_ip(mbufs[j]);
735 prefix = ctrl_ring_get_prefix(mbufs[j]);
736 gateway_index = get_nh_index(tbase, gateway_ip);
737 if (gateway_index >= MAX_HOP_INDEX) {
738 plog_err("Unable to find or define gateway index - too many\n");
// Check whether the rule already exists so the log can distinguish "modified" from "added".
741 modified_route = rte_lpm_is_rule_present(tbase->l3.ipv4_lpm, rte_bswap32(ip), prefix, &nh);
742 ret = rte_lpm_add(tbase->l3.ipv4_lpm, rte_bswap32(ip), prefix, gateway_index);
744 plog_err("Failed to add route to "IPv4_BYTES_FMT"/%d using "IPv4_BYTES_FMT"(index = %d)\n", IP4(ip), prefix, IP4(gateway_ip), gateway_index);
745 } else if (modified_route)
746 plogx_dbg("Modified route to "IPv4_BYTES_FMT"/%d using "IPv4_BYTES_FMT"(index = %d) (was using "IPv4_BYTES_FMT"(index = %d)\n", IP4(ip), prefix, IP4(gateway_ip), gateway_index, IP4(tbase->l3.next_hops[nh].ip), nh);
748 plogx_dbg("Added new route to "IPv4_BYTES_FMT"/%d using "IPv4_BYTES_FMT"(index = %d)\n", IP4(ip), prefix, IP4(gateway_ip), gateway_index);
752 case ROUTE_DEL_FROM_MASTER:
753 ip = ctrl_ring_get_ip(mbufs[j]);
754 prefix = ctrl_ring_get_prefix(mbufs[j]);
756 ret = rte_lpm_is_rule_present(tbase->l3.ipv4_lpm, rte_bswap32(ip), prefix, &nh);
758 ret = rte_lpm_delete(tbase->l3.ipv4_lpm, rte_bswap32(ip), prefix);
// NOTE(review): this message is emitted in the delete path although it says "add".
760 plog_err("Failed to add rule\n");
762 plog_info("Deleting route to "IPv4_BYTES_FMT"/%d\n", IP4(ip), prefix);
766 case MAC_INFO_FROM_MASTER:
767 hdr_arp = rte_pktmbuf_mtod(mbufs[j], struct ether_hdr_arp *);
768 ip = get_ip(mbufs[j]);
770 if (prox_rte_is_zero_ether_addr(&hdr_arp->arp.data.sha)) {
771 // MAC timeout or deleted from kernel table => reset update_time
772 // This will cause us to send new ARP request
773 // However, as reachable_timeout not touched, we should continue sending our regular IP packets
774 reset_arp_ndp_retransmit_timeout(l3, ip);
777 plogx_dbg("\tUpdating MAC entry for IP "IPv4_BYTES_FMT" with MAC "MAC_BYTES_FMT"\n",
778 IP4(ip), MAC_BYTES(hdr_arp->arp.data.sha.addr_bytes));
782 struct arp_table *entry;
// rte_hash_add_key() is used to obtain the table position of ip (adding the key if needed).
783 ret = rte_hash_add_key(l3->ip_hash, (const void *)&ip);
785 plogx_info("Unable add ip "IPv4_BYTES_FMT" in mac_hash\n", IP4(ip));
// nh other than MAX_HOP_INDEX: the address is a gateway, so the MAC is stored in its next-hop entry.
786 } else if ((nh = l3->arp_table[ret].nh) != MAX_HOP_INDEX) {
787 entry = &l3->next_hops[nh];
788 memcpy(&entry->mac, &(hdr_arp->arp.data.sha), sizeof(prox_rte_ether_addr));
789 entry->reachable_timeout = tsc + reachable_timeout;
790 update_arp_ndp_retransmit_timeout(l3, &entry->arp_ndp_retransmit_timeout, l3->arp_ndp_retransmit_timeout);
792 memcpy(&l3->arp_table[ret].mac, &(hdr_arp->arp.data.sha), sizeof(prox_rte_ether_addr));
793 l3->arp_table[ret].reachable_timeout = tsc + reachable_timeout;
794 update_arp_ndp_retransmit_timeout(l3, &l3->arp_table[ret].arp_ndp_retransmit_timeout, l3->arp_ndp_retransmit_timeout);
797 else if (ip == l3->gw.ip) {
798 // MAC address of the gateway
799 memcpy(&l3->gw.mac, &hdr_arp->arp.data.sha, 6);
// This is the flag tested by write_dst_mac() before using the cached gateway MAC.
800 l3->flags |= FLAG_DST_MAC_KNOWN;
801 l3->gw.reachable_timeout = tsc + reachable_timeout;
802 update_arp_ndp_retransmit_timeout(l3, &l3->gw.arp_ndp_retransmit_timeout, l3->arp_ndp_retransmit_timeout);
803 } else if (l3->n_pkts < 4) {
804 // Few packets tracked - should be faster to loop through them than using a hash table
805 for (idx = 0; idx < l3->n_pkts; idx++) {
806 ip_dst = l3->optimized_arp_table[idx].ip;
// idx below n_pkts after the loop means the scan stopped early, presumably on a matching entry.
810 if (idx < l3->n_pkts) {
811 memcpy(&l3->optimized_arp_table[idx].mac, &(hdr_arp->arp.data.sha), sizeof(prox_rte_ether_addr));
812 l3->optimized_arp_table[idx].reachable_timeout = tsc + reachable_timeout;
813 update_arp_ndp_retransmit_timeout(l3, &l3->optimized_arp_table[idx].arp_ndp_retransmit_timeout, l3->arp_ndp_retransmit_timeout);
816 ret = rte_hash_add_key(l3->ip_hash, (const void *)&ip);
818 plogx_info("Unable add ip "IPv4_BYTES_FMT" in mac_hash\n", IP4(ip));
820 memcpy(&l3->arp_table[ret].mac, &(hdr_arp->arp.data.sha), sizeof(prox_rte_ether_addr));
821 l3->arp_table[ret].reachable_timeout = tsc + reachable_timeout;
822 update_arp_ndp_retransmit_timeout(l3, &l3->arp_table[ret].arp_ndp_retransmit_timeout, l3->arp_ndp_retransmit_timeout);
827 case MAC_INFO_FROM_MASTER_FOR_IPV6:
828 ip6 = ctrl_ring_get_ipv6_addr(mbufs[j]);
// The MAC address travels in the 64-bit data word; its first sizeof(prox_rte_ether_addr) bytes are copied below.
829 uint64_t data = ctrl_ring_get_data(mbufs[j]);
831 if (l3->n_pkts < 4) {
832 // Few packets tracked - should be faster to loop through them than using a hash table
833 for (idx = 0; idx < l3->n_pkts; idx++) {
834 ip6_dst = &l3->optimized_arp_table[idx].ip6;
835 if (memcmp(ip6_dst, ip6, sizeof(struct ipv6_addr)) == 0)
838 if (idx < l3->n_pkts) {
839 // IP found; this is a reply for one of our requests!
840 memcpy(&l3->optimized_arp_table[idx].mac, &data, sizeof(prox_rte_ether_addr));
841 l3->optimized_arp_table[idx].reachable_timeout = tsc + l3->reachable_timeout * hz / 1000;
844 int ret = rte_hash_add_key(l3->ip6_hash, (const void *)ip6);
846 plogx_info("Unable add ip "IPv6_BYTES_FMT" in mac_hash\n", IPv6_BYTES(ip6->bytes));
848 memcpy(&l3->arp_table[ret].mac, &data, sizeof(prox_rte_ether_addr));
849 l3->arp_table[ret].reachable_timeout = tsc + l3->reachable_timeout * hz / 1000;
854 case SEND_NDP_FROM_MASTER:
855 case SEND_ARP_REQUEST_FROM_MASTER:
856 case SEND_ARP_REPLY_FROM_MASTER:
858 // tx_ctrlplane_pkt does not drop packets
859 plogx_dbg("\tForwarding (ARP) packet from master\n");
860 tbase->aux->tx_ctrlplane_pkt(tbase, &mbufs[j], 1, out);
861 TASK_STATS_ADD_TX_NON_DP(&tbase->aux->stats, 1);
863 case SEND_ICMP_FROM_MASTER:
865 // tx_ctrlplane_pkt does not drop packets
866 plogx_dbg("\tForwarding (PING) packet from master\n");
867 tbase->aux->tx_ctrlplane_pkt(tbase, &mbufs[j], 1, out);
868 TASK_STATS_ADD_TX_NON_DP(&tbase->aux->stats, 1);
871 // Drop Pseudo packets sent to generate ARP requests
872 // There are other IPv4 packets sent from TAP which we cannot delete e.g. BGP packets
874 hdr = rte_pktmbuf_mtod(mbufs[j], prox_rte_ether_hdr *);
// Locate the IPv4 header, either right after the Ethernet header or after a single VLAN tag.
// NOTE(review): pip is tested against NULL below; its initialisation is not visible here.
875 if (hdr->ether_type == ETYPE_IPv4) {
876 pip = (prox_rte_ipv4_hdr *)(hdr + 1);
877 } else if (hdr->ether_type == ETYPE_VLAN) {
878 prox_rte_vlan_hdr *vlan = (prox_rte_vlan_hdr *)(hdr + 1);
// NOTE(review): this assignment repeats the initialisation on the previous line.
879 vlan = (prox_rte_vlan_hdr *)(hdr + 1);
880 if (vlan->eth_proto == ETYPE_IPv4) {
881 pip = (prox_rte_ipv4_hdr *)(vlan + 1);
// The UDP header is taken right after the fixed-size IPv4 header, i.e. this assumes no IPv4 options.
884 if (pip && (pip->next_proto_id == IPPROTO_UDP)) {
885 udp_hdr = (prox_rte_udp_hdr *)(pip + 1);
// A pseudo packet uses PROX_PSEUDO_PKT_PORT for both ports and carries no payload (dgram_len of 8 = UDP header only).
886 if ((udp_hdr->dst_port == rte_cpu_to_be_16(PROX_PSEUDO_PKT_PORT)) &&
887 (udp_hdr->src_port == rte_cpu_to_be_16(PROX_PSEUDO_PKT_PORT)) &&
888 (rte_be_to_cpu_16(udp_hdr->dgram_len) == 8)) {
889 plogx_dbg("Dropping PROX packet\n");
895 uint16_t src_port = 0, dst_port = 0, len = 0;
897 src_port = udp_hdr->src_port;
898 dst_port = udp_hdr->dst_port;
899 len = rte_be_to_cpu_16(udp_hdr->dgram_len);
901 plogx_dbg("tForwarding TAP packet from master. Type = %x, pip=%p, udp = %p, udp = {src = %x, dst = %x, len = %d}\n", hdr->ether_type, pip, udp_hdr, src_port, dst_port,len );
903 // tx_ctrlplane_pkt does not drop packets
904 tbase->aux->tx_ctrlplane_pkt(tbase, &mbufs[j], 1, out);
905 TASK_STATS_ADD_TX_NON_DP(&tbase->aux->stats, 1);
907 case IPV6_INFO_FROM_MASTER:
908 // addr = ctrl_ring_get_data(mbufs[j]);
909 ip6 = ctrl_ring_get_ipv6_addr(mbufs[j]);
// No global address yet: adopt the one received from the master.
910 if (memcmp(&l3->global_ipv6 , &null_addr, 16) == 0) {
911 memcpy(&l3->global_ipv6, ip6, sizeof(struct ipv6_addr));
912 plog_info("Core %d task %d received global IP "IPv6_BYTES_FMT"\n", l3->core_id, l3->task_id, IPv6_BYTES(ip6->bytes));
// Global address already set: only the first 8 bytes (the prefix) are compared, and a match is logged once.
913 } else if (memcmp(&l3->global_ipv6, ip6, 8) == 0) {
914 if (l3->prefix_printed == 0) {
915 plog_info("Core %d task %d received expected prefix "IPv6_PREFIX_FMT"\n", l3->core_id, l3->task_id, IPv6_PREFIX(ip6->bytes));
916 l3->prefix_printed = 1;
919 plog_warn("Core %d task %d received unexpected prefix "IPv6_PREFIX_FMT", IP = "IPv6_PREFIX_FMT"\n", l3->core_id, l3->task_id, IPv6_PREFIX(ip6->bytes), IPv6_PREFIX(l3->global_ipv6.bytes));
924 plog_err("Unexpected message received: %d\n", command);