Improve ctrlplane performance
[samplevnf.git] / VNFs / DPPD-PROX / packet_utils.c
index ff00fb4..e06529c 100644 (file)
@@ -1,5 +1,5 @@
 /*
-// Copyright (c) 2010-2017 Intel Corporation
+// Copyright (c) 2010-2020 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 #include "lconf.h"
 #include "prefetch.h"
 #include "log.h"
+#include "defines.h"
 #include "handle_master.h"
 #include "prox_port_cfg.h"
+#include "packet_utils.h"
 
 static inline int find_ip(struct ether_hdr_arp *pkt, uint16_t len, uint32_t *ip_dst)
 {
-       struct vlan_hdr *vlan_hdr;
-       struct ether_hdr *eth_hdr = (struct ether_hdr*)pkt;
-       struct ipv4_hdr *ip;
+       prox_rte_vlan_hdr *vlan_hdr;
+       prox_rte_ether_hdr *eth_hdr = (prox_rte_ether_hdr*)pkt;
+       prox_rte_ipv4_hdr *ip;
        uint16_t ether_type = eth_hdr->ether_type;
-       uint16_t l2_len = sizeof(struct ether_hdr);
+       uint16_t l2_len = sizeof(prox_rte_ether_hdr);
 
        // Unstack VLAN tags
-       while (((ether_type == ETYPE_8021ad) || (ether_type == ETYPE_VLAN)) && (l2_len + sizeof(struct vlan_hdr) < len)) {
-               vlan_hdr = (struct vlan_hdr *)((uint8_t *)pkt + l2_len);
+       while (((ether_type == ETYPE_8021ad) || (ether_type == ETYPE_VLAN)) && (l2_len + sizeof(prox_rte_vlan_hdr) < len)) {
+               vlan_hdr = (prox_rte_vlan_hdr *)((uint8_t *)pkt + l2_len);
                l2_len +=4;
                ether_type = vlan_hdr->eth_proto;
        }
@@ -58,8 +60,8 @@ static inline int find_ip(struct ether_hdr_arp *pkt, uint16_t len, uint32_t *ip_
                break;
        }
 
-       if (l2_len && (l2_len + sizeof(struct ipv4_hdr) <= len)) {
-               struct ipv4_hdr *ip = (struct ipv4_hdr *)((uint8_t *)pkt + l2_len);
+       if (l2_len && (l2_len + sizeof(prox_rte_ipv4_hdr) <= len)) {
+               prox_rte_ipv4_hdr *ip = (prox_rte_ipv4_hdr *)((uint8_t *)pkt + l2_len);
                // TODO: implement LPM => replace ip_dst by next hop IP DST
                *ip_dst = ip->dst_addr;
                return 0;
@@ -67,89 +69,137 @@ static inline int find_ip(struct ether_hdr_arp *pkt, uint16_t len, uint32_t *ip_
        return -1;
 }
 
-int write_dst_mac(struct task_base *tbase, struct rte_mbuf *mbuf, uint32_t *ip_dst)
+/* This implementation could be improved: instead of checking each time we send a packet whether we need also
+   to send an ARP, we should only check whether the MAC is valid.
+   We should check arp_update_time in the master process. This would also require the generating task to clear its arp ring
+   to avoid sending many ARP while starting after a long stop.
+   We could also check for arp_timeout in the master so that dataplane has only to check whether MAC is available
+   but this would require either thread safety, or the exchange of information between master and generating core.
+*/
+
+int write_dst_mac(struct task_base *tbase, struct rte_mbuf *mbuf, uint32_t *ip_dst, uint64_t **time)
 {
        const uint64_t hz = rte_get_tsc_hz();
        struct ether_hdr_arp *packet = rte_pktmbuf_mtod(mbuf, struct ether_hdr_arp *);
-       struct ether_addr *mac = &packet->ether_hdr.d_addr;
+       prox_rte_ether_addr *mac = &packet->ether_hdr.d_addr;
 
        uint64_t tsc = rte_rdtsc();
        struct l3_base *l3 = &(tbase->l3);
        if (l3->gw.ip) {
                if (likely((l3->flags & FLAG_DST_MAC_KNOWN) && (tsc < l3->gw.arp_update_time) && (tsc < l3->gw.arp_timeout))) {
-                       memcpy(mac, &l3->gw.mac, sizeof(struct ether_addr));
-                       return 0;
+                       memcpy(mac, &l3->gw.mac, sizeof(prox_rte_ether_addr));
+                       return SEND_MBUF;
                } else if (tsc > l3->gw.arp_update_time) {
-                       // long time since we have sent an arp, send arp
-                       l3->gw.arp_update_time = tsc + hz;
+                       // long time since we have successfully sent an arp, send arp
+                       // If sending ARP failed (ring full) then arp_update_time is not updated to avoid having to wait 1 sec to send ARP REQ again
+                       *time = &l3->gw.arp_update_time;
                        *ip_dst = l3->gw.ip;
-                       return -1;
+                       if ((l3->flags & FLAG_DST_MAC_KNOWN) && (tsc < l3->gw.arp_timeout)){
+                               // MAC is valid in the table => send also the mbuf
+                               memcpy(mac, &l3->gw.mac, sizeof(prox_rte_ether_addr));
+                               return SEND_MBUF_AND_ARP;
+                       } else {
+                               // MAC still unknown, or timed out => only send ARP
+                               return SEND_ARP;
+                       }
+               } else {
+                       // MAC is unknown and we already sent an ARP recently, drop mbuf and wait for ARP reply
+                       return DROP_MBUF;
                }
-               return -2;
        }
 
        uint16_t len = rte_pktmbuf_pkt_len(mbuf);
        if (find_ip(packet, len, ip_dst) != 0) {
-               return 0;
+               // Unable to find IP address => non IP packet => send it as is
+               return SEND_MBUF;
        }
        if (likely(l3->n_pkts < 4)) {
                for (unsigned int idx = 0; idx < l3->n_pkts; idx++) {
                        if (*ip_dst == l3->optimized_arp_table[idx].ip) {
+                                // IP address already in table
                                if ((tsc < l3->optimized_arp_table[idx].arp_update_time) && (tsc < l3->optimized_arp_table[idx].arp_timeout)) {
-                                       memcpy(mac, &l3->optimized_arp_table[idx].mac, sizeof(struct ether_addr));
-                                       return 0;
+                                       // MAC address was recently updated in table, use it
+                                       memcpy(mac, &l3->optimized_arp_table[idx].mac, sizeof(prox_rte_ether_addr));
+                                       return SEND_MBUF;
                                } else if (tsc > l3->optimized_arp_table[idx].arp_update_time) {
-                                       l3->optimized_arp_table[idx].arp_update_time = tsc + hz;
-                                       return -1;
+                                       // ARP not sent since a long time, send ARP
+                                               *time = &l3->optimized_arp_table[idx].arp_update_time;
+                                       if (tsc < l3->optimized_arp_table[idx].arp_timeout) {
+                                               // MAC still valid => also send mbuf
+                                               memcpy(mac, &l3->optimized_arp_table[idx].mac, sizeof(prox_rte_ether_addr));
+                                               return SEND_MBUF_AND_ARP;
+                                       } else {
+                                               // MAC invalid => only send ARP
+                                               return SEND_ARP;
+                                       }
                                } else {
-                                       return -2;
+                                       //  ARP timeout elapsed, MAC not valid anymore but waiting for ARP reply
+                                       return DROP_MBUF;
                                }
                        }
                }
+               // IP address not found in table
                l3->optimized_arp_table[l3->n_pkts].ip = *ip_dst;
-               l3->optimized_arp_table[l3->n_pkts].arp_update_time = tsc + hz;
+               *time = &l3->optimized_arp_table[l3->n_pkts].arp_update_time;
                l3->n_pkts++;
 
-               if (l3->n_pkts < 4)
-                       return -1;
+               if (l3->n_pkts < 4) {
+                       return SEND_ARP;
+               }
 
-               // We have ** many ** IP addresses; lets use hash table instead
+               // We have too many IP addresses to search linearly; let's use a hash table instead => copy all entries in hash table
                for (uint32_t idx = 0; idx < l3->n_pkts; idx++) {
                        uint32_t ip = l3->optimized_arp_table[idx].ip;
                        int ret = rte_hash_add_key(l3->ip_hash, (const void *)&ip);
                        if (ret < 0) {
-                               plogx_info("Unable add ip %d.%d.%d.%d in mac_hash\n", IP4(ip));
+                               // This should not happen as few entries so far.
+                               // If it happens, we still send the ARP as easier:
+                               //      If the ARP corresponds to this error, the ARP reply will be ignored
+                               //      If ARP does not correspond to this error/ip, then ARP reply will be handled.
+                               plogx_err("Unable add ip %d.%d.%d.%d in mac_hash (already %d entries)\n", IP4(ip), idx);
                        } else {
                                memcpy(&l3->arp_table[ret], &l3->optimized_arp_table[idx], sizeof(struct arp_table));
                        }
                }
-               return -1;
+               return SEND_ARP;
        } else {
-               // Find mac in lookup table. Send ARP if not found
+               // Find IP in lookup table. Send ARP if not found
                int ret = rte_hash_lookup(l3->ip_hash, (const void *)ip_dst);
                if (unlikely(ret < 0)) {
+                       // IP not found, try to send an ARP
                        int ret = rte_hash_add_key(l3->ip_hash, (const void *)ip_dst);
                        if (ret < 0) {
-                               plogx_info("Unable add ip %d.%d.%d.%d in mac_hash\n", IP4(*ip_dst));
-                               return -2;
+                               // No reason to send ARP, as reply would be anyhow ignored
+                               plogx_err("Unable to add ip %d.%d.%d.%d in mac_hash\n", IP4(*ip_dst));
+                               return DROP_MBUF;
                        } else {
                                l3->arp_table[ret].ip = *ip_dst;
-                               l3->arp_table[ret].arp_update_time = tsc + hz;
+                               *time = &l3->arp_table[ret].arp_update_time;
                        }
-                       return -1;
+                       return SEND_ARP;
                } else {
-                       if ((tsc < l3->arp_table[ret].arp_update_time) && (tsc < l3->arp_table[ret].arp_timeout)) {
-                               memcpy(mac, &l3->arp_table[ret].mac, sizeof(struct ether_addr));
-                               return 0;
+                       // IP has been found
+                       if (likely((tsc < l3->arp_table[ret].arp_update_time) && (tsc < l3->arp_table[ret].arp_timeout))) {
+                               // MAC still valid and ARP sent recently
+                               memcpy(mac, &l3->arp_table[ret].mac, sizeof(prox_rte_ether_addr));
+                               return SEND_MBUF;
                        } else if (tsc > l3->arp_table[ret].arp_update_time) {
-                               l3->arp_table[ret].arp_update_time = tsc + hz;
-                               return -1;
+                               // ARP not sent since a long time, send ARP
+                               *time = &l3->arp_table[ret].arp_update_time;
+                               if (tsc < l3->arp_table[ret].arp_timeout) {
+                                       // MAC still valid => send also MBUF
+                                       memcpy(mac, &l3->arp_table[ret].mac, sizeof(prox_rte_ether_addr));
+                                       return SEND_MBUF_AND_ARP;
+                               } else {
+                                       return SEND_ARP;
+                               }
                        } else {
-                               return -2;
+                               return DROP_MBUF;
                        }
                }
        }
-       return 0;
+       // Should not happen
+       return DROP_MBUF;
 }
 
 void task_init_l3(struct task_base *tbase, struct task_args *targ)
@@ -182,17 +232,41 @@ void task_init_l3(struct task_base *tbase, struct task_args *targ)
        tbase->l3.core_id = targ->lconf->id;
        tbase->l3.task_id = targ->id;
        tbase->l3.tmaster = targ->tmaster;
+       tbase->l3.seed = (uint)rte_rdtsc();
+       if (targ->arp_timeout != 0)
+               tbase->l3.arp_timeout = targ->arp_timeout;
+       else
+               tbase->l3.arp_timeout = DEFAULT_ARP_TIMEOUT;
+       if (targ->arp_update_time != 0)
+               tbase->l3.arp_update_time = targ->arp_update_time;
+       else
+               tbase->l3.arp_update_time = DEFAULT_ARP_UPDATE_TIME;
 }
 
 void task_start_l3(struct task_base *tbase, struct task_args *targ)
 {
+       const int NB_ARP_MBUF = 1024;
+       const int ARP_MBUF_SIZE = 2048;
+       const int NB_CACHE_ARP_MBUF = 256;
+
        struct prox_port_cfg *port = find_reachable_port(targ);
-        if (port) {
+        if (port && (tbase->l3.arp_pool == NULL)) {
+               static char name[] = "arp0_pool";
                 tbase->l3.reachable_port_id = port - prox_port_cfg;
                if (targ->local_ipv4) {
                        tbase->local_ipv4 = rte_be_to_cpu_32(targ->local_ipv4);
                        register_ip_to_ctrl_plane(tbase->l3.tmaster, tbase->local_ipv4, tbase->l3.reachable_port_id, targ->lconf->id, targ->id);
                }
+               master_init_vdev(tbase->l3.tmaster, tbase->l3.reachable_port_id, targ->lconf->id, targ->id);
+               name[3]++;
+               struct rte_mempool *ret = rte_mempool_create(name, NB_ARP_MBUF, ARP_MBUF_SIZE, NB_CACHE_ARP_MBUF,
+                       sizeof(struct rte_pktmbuf_pool_private), rte_pktmbuf_pool_init, NULL, rte_pktmbuf_init, 0,
+                       rte_socket_id(), 0);
+               PROX_PANIC(ret == NULL, "Failed to allocate ARP memory pool on socket %u with %u elements\n",
+                       rte_socket_id(), NB_ARP_MBUF);
+               plog_info("\t\tMempool %p (%s) size = %u * %u cache %u, socket %d\n", ret, name, NB_ARP_MBUF,
+                       ARP_MBUF_SIZE, NB_CACHE_ARP_MBUF, rte_socket_id());
+               tbase->l3.arp_pool = ret;
        }
 }
 
@@ -207,6 +281,29 @@ void task_set_local_ip(struct task_base *tbase, uint32_t ip)
        tbase->local_ipv4 = ip;
 }
 
+static void reset_arp_update_time(struct l3_base *l3, uint32_t ip)
+{
+       uint32_t idx;
+       plogx_info("\tMAC entry for IP "IPv4_BYTES_FMT" timeout in kernel\n", IP4(ip));
+       if (ip == l3->gw.ip) {
+               l3->gw.arp_update_time = 0;
+       } else if (l3->n_pkts < 4) {
+               for (idx = 0; idx < l3->n_pkts; idx++) {
+                       uint32_t ip_dst = l3->optimized_arp_table[idx].ip;
+                       if (ip_dst == ip)
+                               break;
+               }
+               if (idx < l3->n_pkts) {
+                       l3->optimized_arp_table[idx].arp_update_time = 0;
+               }
+       } else {
+               int ret = rte_hash_lookup(l3->ip_hash, (const void *)&ip);
+               if (ret >= 0)
+                       l3->arp_table[ret].arp_update_time = 0;
+       }
+       return;
+}
+
 void handle_ctrl_plane_pkts(struct task_base *tbase, struct rte_mbuf **mbufs, uint16_t n_pkts)
 {
        uint8_t out[1];
@@ -217,6 +314,7 @@ void handle_ctrl_plane_pkts(struct task_base *tbase, struct rte_mbuf **mbufs, ui
        struct ether_hdr_arp *hdr;
        struct l3_base *l3 = &tbase->l3;
        uint64_t tsc= rte_rdtsc();
+       uint64_t update_time = l3->arp_timeout * hz / 1000;
 
        for (j = 0; j < n_pkts; ++j) {
                PREFETCH0(mbufs[j]);
@@ -234,11 +332,22 @@ void handle_ctrl_plane_pkts(struct task_base *tbase, struct rte_mbuf **mbufs, ui
                        hdr = rte_pktmbuf_mtod(mbufs[j], struct ether_hdr_arp *);
                        ip = (mbufs[j]->udata64 >> 32) & 0xFFFFFFFF;
 
+                       if (prox_rte_is_zero_ether_addr(&hdr->arp.data.sha)) {
+                               // MAC timeout or deleted from kernel table => reset update_time
+                               // This will cause us to send new ARP request
+                               // However, as arp_timeout not touched, we should continue sending our regular IP packets
+                               reset_arp_update_time(l3, ip);
+                               plogx_info("\tTimeout for MAC entry for IP "IPv4_BYTES_FMT"\n", IP4(ip));
+                               return;
+                       } else
+                               plogx_dbg("\tUpdating MAC entry for IP "IPv4_BYTES_FMT" with MAC "MAC_BYTES_FMT"\n",
+                                       IP4(ip), MAC_BYTES(hdr->arp.data.sha.addr_bytes));
                        if (ip == l3->gw.ip) {
                                // MAC address of the gateway
                                memcpy(&l3->gw.mac, &hdr->arp.data.sha, 6);
                                l3->flags |= FLAG_DST_MAC_KNOWN;
-                               l3->gw.arp_timeout = tsc + 30 * hz;
+                               l3->gw.arp_timeout = tsc + update_time;
+                               update_arp_update_time(l3, &l3->gw.arp_update_time, l3->arp_update_time);
                        } else if (l3->n_pkts < 4) {
                                // Few packets tracked - should be faster to loop through them thean using a hash table
                                for (idx = 0; idx < l3->n_pkts; idx++) {
@@ -247,26 +356,31 @@ void handle_ctrl_plane_pkts(struct task_base *tbase, struct rte_mbuf **mbufs, ui
                                                break;
                                }
                                if (idx < l3->n_pkts) {
-                                       // IP not found; this is a reply while we never asked for the request!
-                                       memcpy(&l3->optimized_arp_table[idx].mac, &(hdr->arp.data.sha), sizeof(struct ether_addr));
-                                       l3->optimized_arp_table[idx].arp_timeout = tsc + 30 * hz;
+                                       memcpy(&l3->optimized_arp_table[idx].mac, &(hdr->arp.data.sha), sizeof(prox_rte_ether_addr));
+                                       l3->optimized_arp_table[idx].arp_timeout = tsc + update_time;
+                                       update_arp_update_time(l3, &l3->optimized_arp_table[idx].arp_update_time, l3->arp_update_time);
                                }
                        } else {
                                int ret = rte_hash_add_key(l3->ip_hash, (const void *)&ip);
                                if (ret < 0) {
                                        plogx_info("Unable add ip %d.%d.%d.%d in mac_hash\n", IP4(ip));
                                } else {
-                                       memcpy(&l3->arp_table[ret].mac, &(hdr->arp.data.sha), sizeof(struct ether_addr));
-                                       l3->arp_table[ret].arp_timeout = tsc + 30 * hz;
+                                       memcpy(&l3->arp_table[ret].mac, &(hdr->arp.data.sha), sizeof(prox_rte_ether_addr));
+                                       l3->arp_table[ret].arp_timeout = tsc + update_time;
+                                       update_arp_update_time(l3, &l3->arp_table[ret].arp_update_time, l3->arp_update_time);
                                }
                        }
                        tx_drop(mbufs[j]);
                        break;
                case ARP_REPLY_FROM_CTRL:
+               case ICMP_FROM_CTRL:
                case ARP_REQ_FROM_CTRL:
-                       TASK_STATS_ADD_TX_NON_DP(&tbase->aux->stats, 1);
+               case PKT_FROM_TAP:
                        out[0] = 0;
-                       tbase->aux->tx_pkt_l2(tbase, &mbufs[j], 1, out);
+                       // tx_ctrlplane_pkt does not drop packets
+                       plogx_dbg("\tForwarding (ARP/PING) packet from master\n");
+                       tbase->aux->tx_ctrlplane_pkt(tbase, &mbufs[j], 1, out);
+                       TASK_STATS_ADD_TX_NON_DP(&tbase->aux->stats, 1);
                        break;
                }
        }