Added initial support for BGP 28/70128/5
authorXavier Simonart <xavier.simonart@intel.com>
Sat, 2 May 2020 19:51:24 +0000 (21:51 +0200)
committerXavier Simonart <xavier.simonart@intel.com>
Fri, 29 May 2020 21:31:54 +0000 (23:31 +0200)
With this commit, BGP messages are forwarded to the tap device.
Netlink messages are enabled to receive route updates.

In addition, generating tasks can also specify a routing table
which will be used when sending packets

The routes initialized by the routing table can be changed through
the reception of BGP messages

Change-Id: I187ba9a921885cbc9b209aae5fb654309e3388b8
Signed-off-by: Xavier Simonart <xavier.simonart@intel.com>
13 files changed:
VNFs/DPPD-PROX/gen/gen_tap.cfg [moved from VNFs/DPPD-PROX/gen_tap.cfg with 85% similarity]
VNFs/DPPD-PROX/gen/l3-ipv4.lua [new file with mode: 0644]
VNFs/DPPD-PROX/handle_master.c
VNFs/DPPD-PROX/handle_master.h
VNFs/DPPD-PROX/packet_utils.c
VNFs/DPPD-PROX/packet_utils.h
VNFs/DPPD-PROX/prox_args.c
VNFs/DPPD-PROX/prox_compat.h
VNFs/DPPD-PROX/prox_lua_types.c
VNFs/DPPD-PROX/rx_pkt.c
VNFs/DPPD-PROX/task_init.h
VNFs/DPPD-PROX/tx_pkt.c
VNFs/DPPD-PROX/tx_pkt.h

similarity index 85%
rename from VNFs/DPPD-PROX/gen_tap.cfg
rename to VNFs/DPPD-PROX/gen/gen_tap.cfg
index fd74672..6023968 100644 (file)
@@ -18,6 +18,9 @@
 -n=4 ; force number of memory channels
 no-output=no ; disable DPDK debug output
 
+[lua]
+lpm4 = dofile("l3-ipv4.lua")
+
 [port 0]
 name=p0
 vdev=gen_tap
@@ -31,28 +34,30 @@ start time=5
 name=Basic Gen
 
 [variables]
-$hex_ip1=0a 0a 0a 01
-$hex_ip2=0a 0a 0a 02
-$ip1=10.10.10.1
-$ip2=10.10.10.2
+$hex_ip1=c0 a8 7a 7e
+$hex_ip2=c0 a8 7b 7f
+$ip1=192.168.122.126
+$ip2=192.168.123.127
 
 [core 0s0]
 mode=master
 
-[core 1s0]
+[core 1]
 name=p0
 task=0
 mode=gen
 sub mode=l3
 tx port=p0
+route table=lpm4
 bps=1250000000
 pkt inline=00 00 01 00 00 01 00 00 02 00 00 02 08 00 45 00 00 1c 00 01 00 00 40 11 f7 7d ${hex_ip1} ${hex_ip2} 13 88 13 88 00 08 55 7b
 pkt size=60
 lat pos=42
 packet id pos=46
 min bulk size=8
+local ipv4=${ip1}/24
 
-[core 2s0]
+[core 2]
 name=nop
 task=0
 mode=lat
@@ -61,4 +66,4 @@ rx port=p0
 drop=no
 lat pos=42
 packet id pos=46
-local ipv4=${ip1}
+local ipv4=${ip1}/24
diff --git a/VNFs/DPPD-PROX/gen/l3-ipv4.lua b/VNFs/DPPD-PROX/gen/l3-ipv4.lua
new file mode 100644 (file)
index 0000000..1c98834
--- /dev/null
@@ -0,0 +1,29 @@
+--
+-- Copyright (c) 2010-2017 Intel Corporation
+--
+-- Licensed under the Apache License, Version 2.0 (the "License");
+-- you may not use this file except in compliance with the License.
+-- You may obtain a copy of the License at
+--
+--     http://www.apache.org/licenses/LICENSE-2.0
+--
+-- Unless required by applicable law or agreed to in writing, software
+-- distributed under the License is distributed on an "AS IS" BASIS,
+-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+-- See the License for the specific language governing permissions and
+-- limitations under the License.
+--
+
+local lpm4 = {} -- routing table description returned by this script (gen_tap.cfg loads it with dofile)
+lpm4.next_hops = { -- one entry per next hop: an id, a port_id and the next-hop IPv4 address
+   {id = 0,  port_id = 0, ip = ip("192.168.122.240")},
+   {id = 1,  port_id = 0, ip = ip("192.168.122.246")},
+   {id = 2,  port_id = 0, ip = ip("192.168.122.247")}
+}
+
+lpm4.routes = { -- one /24 prefix per entry; next_hop_id presumably refers to an id in next_hops above (TODO confirm)
+   {cidr = {ip = ip("192.168.123.0"), depth = 24}, next_hop_id = 0},
+   {cidr = {ip = ip("192.168.124.0"), depth = 24}, next_hop_id = 1},
+   {cidr = {ip = ip("192.168.125.0"), depth = 24}, next_hop_id = 2},
+}
+return lpm4
index b6b123c..263f0c8 100644 (file)
@@ -20,6 +20,7 @@
 #include <linux/netlink.h>
 #include <linux/rtnetlink.h>
 #include <poll.h>
+#include <net/if.h>
 
 #include <rte_hash.h>
 #include <rte_hash_crc.h>
@@ -51,11 +52,15 @@ static char netlink_buf[NETLINK_BUF_SIZE];
 
 const char *actions_string[] = { // Indexed by the command value (enum arp_actions) when logging; keep the order in sync with that enum
        "UPDATE_FROM_CTRL",             // Controlplane sending a MAC update to dataplane
+       "ROUTE_ADD_FROM_CTRL",          // Controlplane sending a new route to dataplane
+       "ROUTE_DEL_FROM_CTRL",          // Controlplane deleting a route from dataplane
        "SEND_ARP_REQUEST_FROM_CTRL",   // Controlplane requesting dataplane to send ARP request
        "SEND_ARP_REPLY_FROM_CTRL",     // Controlplane requesting dataplane to send ARP reply
        "SEND_ICMP_FROM_CTRL",          // Controlplane requesting dataplane to send ICMP message
+       "SEND_BGP_FROM_CTRL",           // Controlplane requesting dataplane to send BGP message
        "ARP_TO_CTRL",                  // ARP sent by dataplane to Controlplane for handling
        "ICMP_TO_CTRL",                 // ICMP sent by dataplane to Controlplane for handling
+       "BGP_TO_CTRL",                  // BGP sent by dataplane to Controlplane for handling
        "REQ_MAC_TO_CTRL",              // Dataplane requesting MAC resolution to Controlplane
        "PKT_FROM_TAP"                  // Packet received by Controlplane from kernel and forwarded to dataplane for sending
 };
@@ -110,6 +115,7 @@ struct task_master {
        struct vdev all_vdev[PROX_MAX_PORTS];
        int max_vdev_id;
        struct pollfd arp_fds;
+       struct pollfd route_fds;
 };
 
 struct ip_port {
@@ -278,7 +284,6 @@ static inline int record_request(struct task_base *tbase, uint32_t ip_dst, uint8
        int i;
 
        if (unlikely(ret < 0)) {
-               // entry not found for this IP: delete the reply
                plogx_dbg("Unable to add IP "IPv4_BYTES_FMT" in external_ip_hash\n", IP4(ip_dst));
                return -1;
        }
@@ -417,6 +422,16 @@ static inline void handle_message(struct task_base *tbase, struct rte_mbuf *mbuf
        plogx_dbg("\tMaster received %s (%x) from mbuf %p\n", actions_string[command], command, mbuf);
 
        switch(command) {
+       case BGP_TO_CTRL:
+               if (vdev_port != NO_VDEV_PORT) {
+                       // If a virtual (net_tap) device is attached, send the (BGP) packet to this device
+                       // The kernel will receive and handle it.
+                       plogx_dbg("\tMaster forwarding BGP packet to TAP\n");
+                       int n = rte_eth_tx_burst(prox_port_cfg[port].dpdk_mapping, 0, &mbuf, 1);
+                       return;
+               }
+               tx_drop(mbuf);
+               break;
        case ICMP_TO_CTRL:
                if (vdev_port != NO_VDEV_PORT) {
                        // If a virtual (net_tap) device is attached, send the (PING) packet to this device
@@ -545,6 +560,20 @@ void init_ctrl_plane(struct task_base *tbase)
        task->arp_fds.fd = fd;
        task->arp_fds.events = POLL_IN;
        plog_info("\tRTMGRP_NEIGH netlink group bound; fd = %d\n", fd);
+
+       fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
+       PROX_PANIC(fd < 0, "Failed to open netlink socket: %d\n", errno);
+       fcntl(fd, F_SETFL, fcntl(fd, F_GETFL) | O_NONBLOCK);
+       struct sockaddr_nl sockaddr2;
+       memset(&sockaddr2, 0, sizeof(struct sockaddr_nl));
+       sockaddr2.nl_family = AF_NETLINK;
+       sockaddr2.nl_groups = RTMGRP_IPV6_ROUTE | RTMGRP_IPV4_ROUTE | RTMGRP_NOTIFY;
+       rc = bind(fd, (struct sockaddr *)&sockaddr2, sizeof(struct sockaddr_nl));
+       PROX_PANIC(rc < 0, "Failed to bind to RTMGRP_NEIGH netlink group\n");
+       task->route_fds.fd = fd;
+       task->route_fds.events = POLL_IN;
+       plog_info("\tRTMGRP_IPV4_ROUTE netlink group bound; fd = %d\n", fd);
+
        static char name[] = "master_arp_pool";
        const int NB_ARP_MBUF = 1024;
        const int ARP_MBUF_SIZE = 2048;
@@ -559,6 +588,161 @@ void init_ctrl_plane(struct task_base *tbase)
        tbase->l3.arp_pool = ret;
 }
 
+/*
+ * Read one rtnetlink message from the route socket (task->route_fds) and, if it
+ * describes an IPv4 route added to or deleted from the main or local table on an
+ * interface whose name matches a PROX port, forward the change to the ring of
+ * that port through tx_ring_route().
+ *
+ * Messages that are truncated, multipart, of another type than
+ * RTM_NEWROUTE/RTM_DELROUTE, or of another family than AF_INET are ignored.
+ * The family check matters because the socket is also bound to the IPv6 route
+ * group while all addresses below are handled as 32-bit IPv4 values.
+ */
+static void handle_route_event(struct task_base *tbase)
+{
+       struct task_master *task = (struct task_master *)tbase;
+       struct rte_mbuf *mbufs[MAX_RING_BURST];
+       int fd = task->route_fds.fd, interface_index;
+       char interface_name[IF_NAMESIZE] = {0};
+       int len = recv(fd, netlink_buf, sizeof(netlink_buf), 0);
+       uint32_t ip = 0, gw_ip = 0;
+       if (len < 0) {
+               plog_err("Failed to recv from netlink: %d\n", errno);
+               return;
+       }
+       struct nlmsghdr * nl_hdr = (struct nlmsghdr *)netlink_buf;
+       // Do not trust nlmsg_len before checking that it fits in what was received
+       if (!NLMSG_OK(nl_hdr, (unsigned int)len)) {
+               plog_err("Truncated netlink message (%d bytes)\n", len);
+               return;
+       }
+       if (nl_hdr->nlmsg_flags & NLM_F_MULTI) {
+               plog_err("Unexpected multipart netlink message\n");
+               return;
+       }
+       if ((nl_hdr->nlmsg_type != RTM_NEWROUTE) && (nl_hdr->nlmsg_type != RTM_DELROUTE))
+               return;
+
+       struct rtmsg *rtmsg = (struct rtmsg *)NLMSG_DATA(nl_hdr);
+       // Only IPv4 routes can be forwarded: RTA_DST/RTA_GATEWAY of other families
+       // (e.g. 16 bytes for IPv6) must not be interpreted as 32-bit addresses
+       if (rtmsg->rtm_family != AF_INET)
+               return;
+       if ((rtmsg->rtm_table != RT_TABLE_MAIN) && (rtmsg->rtm_table != RT_TABLE_LOCAL))
+               return;
+       int dst_len = rtmsg->rtm_dst_len;
+
+       struct rtattr *rta = (struct rtattr *)RTM_RTA(rtmsg);
+       int rtl = RTM_PAYLOAD(nl_hdr);
+       for (; RTA_OK(rta, rtl); rta = RTA_NEXT(rta, rtl)) {
+               // memcpy (instead of casting RTA_DATA) avoids unaligned/aliasing accesses;
+               // the payload length is checked before reading
+               switch (rta->rta_type) {
+               case RTA_DST:
+                       if ((size_t)RTA_PAYLOAD(rta) >= sizeof(ip))
+                               memcpy(&ip, RTA_DATA(rta), sizeof(ip));
+                       break;
+               case RTA_OIF:
+                       if ((size_t)RTA_PAYLOAD(rta) < sizeof(interface_index))
+                               break;
+                       memcpy(&interface_index, RTA_DATA(rta), sizeof(interface_index));
+                       if (if_indextoname(interface_index, interface_name) == NULL) {
+                               plog_info("Unknown Interface Index %d\n", interface_index);
+                       }
+                       break;
+               case RTA_GATEWAY:
+                       if ((size_t)RTA_PAYLOAD(rta) >= sizeof(gw_ip))
+                               memcpy(&gw_ip, RTA_DATA(rta), sizeof(gw_ip));
+                       break;
+               default:
+                       break;
+               }
+       }
+       int dpdk_vdev_port = -1;
+       // Without a resolved output interface the route cannot belong to a PROX port;
+       // skipping the lookup also avoids matching a port with an empty name
+       if (interface_name[0] != '\0') {
+               for (int i = 0; i< rte_eth_dev_count(); i++) {
+                       if (strcmp(prox_port_cfg[i].name, interface_name) == 0)
+                               dpdk_vdev_port = i;
+               }
+       }
+       if (dpdk_vdev_port != -1) {
+               plogx_info("Received netlink message on tap interface %s for IP "IPv4_BYTES_FMT"/%d, Gateway  "IPv4_BYTES_FMT"\n", interface_name, IP4(ip), dst_len, IP4(gw_ip));
+               int ret1 = rte_mempool_get(tbase->l3.arp_pool, (void **)mbufs);
+               if (unlikely(ret1 != 0)) {
+                       plog_err("Unable to allocate a mbuf for master to core communication\n");
+                       return;
+               }
+               int dpdk_port = prox_port_cfg[dpdk_vdev_port].dpdk_mapping;
+               // The third argument tells the dataplane whether the route is added (1) or deleted (0)
+               tx_ring_route(tbase, task->internal_port_table[dpdk_port].ring, (nl_hdr->nlmsg_type == RTM_NEWROUTE), mbufs[0], ip, gw_ip, dst_len);
+       } else
+               plog_info("Received netlink message on unknown interface %s for IP "IPv4_BYTES_FMT"/%d, Gateway  "IPv4_BYTES_FMT"\n", interface_name, IP4(ip), dst_len, IP4(gw_ip));
+       return;
+}
+
+/*
+ * Read one rtnetlink neighbour notification from task->arp_fds and mirror it
+ * in external_ip_hash / external_ip_table.
+ *
+ * - IP unknown to the hash: the entry is recorded with its MAC (or nothing is
+ *   recorded when the MAC is all zero, i.e. the kernel entry timed out or was
+ *   deleted).
+ * - IP known and tasks are waiting for it (nb_requests > 0): the MAC is stored
+ *   and an UPDATE_FROM_CTRL message is sent to the ring of every waiting task.
+ *
+ * Messages that are truncated, multipart, of another type than
+ * RTM_NEWNEIGH/RTM_DELNEIGH, or of another family than AF_INET are ignored:
+ * the hash is keyed by 32-bit IPv4 addresses, so e.g. a 16-byte IPv6 NDA_DST
+ * must not be truncated into a key.
+ */
+static void handle_arp_event(struct task_base *tbase)
+{
+       struct task_master *task = (struct task_master *)tbase;
+       struct rte_mbuf *mbufs[MAX_RING_BURST];
+       struct nlmsghdr * nl_hdr;
+       int fd = task->arp_fds.fd;
+       int len, ret;
+       uint32_t ip = 0;
+       prox_rte_ether_addr mac;
+       memset(&mac, 0, sizeof(mac));
+       len = recv(fd, netlink_buf, sizeof(netlink_buf), 0);
+       if (len < 0) {
+               plog_err("Failed to recv from netlink: %d\n", errno);
+               return;
+       }
+       nl_hdr = (struct nlmsghdr *)netlink_buf;
+       // Do not trust nlmsg_len before checking that it fits in what was received
+       if (!NLMSG_OK(nl_hdr, (unsigned int)len)) {
+               plog_err("Truncated netlink message (%d bytes)\n", len);
+               return;
+       }
+       if (nl_hdr->nlmsg_flags & NLM_F_MULTI) {
+               plog_err("Unexpected multipart netlink message\n");
+               return;
+       }
+       if ((nl_hdr->nlmsg_type != RTM_NEWNEIGH) && (nl_hdr->nlmsg_type != RTM_DELNEIGH))
+               return;
+
+       struct ndmsg *ndmsg = (struct ndmsg *)NLMSG_DATA(nl_hdr);
+       // Only IPv4 neighbours can be stored in the IPv4-keyed hash
+       if (ndmsg->ndm_family != AF_INET)
+               return;
+       struct rtattr *rta = (struct rtattr *)RTM_RTA(ndmsg);
+       int rtl = RTM_PAYLOAD(nl_hdr);
+       for (; RTA_OK(rta, rtl); rta = RTA_NEXT(rta, rtl)) {
+               // memcpy (instead of casting RTA_DATA) avoids unaligned/aliasing accesses;
+               // the payload length is checked before reading
+               switch (rta->rta_type) {
+               case NDA_DST:
+                       if ((size_t)RTA_PAYLOAD(rta) >= sizeof(ip))
+                               memcpy(&ip, RTA_DATA(rta), sizeof(ip));
+                       break;
+               case NDA_LLADDR:
+                       if ((size_t)RTA_PAYLOAD(rta) >= sizeof(mac))
+                               memcpy(&mac, RTA_DATA(rta), sizeof(mac));
+                       break;
+               default:
+                       break;
+               }
+       }
+       plogx_info("Received netlink ip "IPv4_BYTES_FMT" with mac "MAC_BYTES_FMT"\n", IP4(ip), MAC_BYTES(mac.addr_bytes));
+       ret = rte_hash_lookup(task->external_ip_hash, (const void *)&ip);
+       if (unlikely(ret < 0)) {
+               // entry not found for this IP: we did not ask a request.
+               // This can happen if the kernel updated the ARP table when receiving an ARP_REQUEST
+               // We must record this, as the ARP entry is now in the kernel table
+               if (prox_rte_is_zero_ether_addr(&mac)) {
+                       // Timeout or MAC deleted from kernel MAC table
+                       rte_hash_del_key(task->external_ip_hash, (const void *)&ip);
+                       plogx_dbg("ip "IPv4_BYTES_FMT" removed from external_ip_hash\n", IP4(ip));
+                       return;
+               }
+               // Separate name: do not shadow the lookup result above
+               int idx = rte_hash_add_key(task->external_ip_hash, (const void *)&ip);
+               if (unlikely(idx < 0)) {
+                       plogx_dbg("IP "IPv4_BYTES_FMT" not found in external_ip_hash and unable to add it\n", IP4(ip));
+                       return;
+               }
+               memcpy(&task->external_ip_table[idx].mac, &mac, sizeof(prox_rte_ether_addr));
+               plogx_dbg("ip "IPv4_BYTES_FMT" added in external_ip_hash with mac "MAC_BYTES_FMT"\n", IP4(ip), MAC_BYTES(mac.addr_bytes));
+               return;
+       }
+
+       // entry found for this IP
+       uint16_t nb_requests = task->external_ip_table[ret].nb_requests;
+       if (nb_requests == 0) {
+               return;
+       }
+
+       memcpy(&task->external_ip_table[ret].mac, &mac, sizeof(prox_rte_ether_addr));
+
+       // If we receive a request from multiple task for the same IP, then we update all tasks
+       int ret1 = rte_mempool_get(tbase->l3.arp_pool, (void **)mbufs);
+       if (unlikely(ret1 != 0)) {
+               plog_err("Unable to allocate a mbuf for master to core communication\n");
+               return;
+       }
+       // The same mbuf is sent to every waiting ring, hence one reference per request
+       rte_mbuf_refcnt_set(mbufs[0], nb_requests);
+       for (int i = 0; i < nb_requests; i++) {
+               struct rte_ring *ring = task->external_ip_table[ret].rings[i];
+               struct ether_hdr_arp *hdr = rte_pktmbuf_mtod(mbufs[0], struct ether_hdr_arp *);
+               memcpy(&hdr->arp.data.sha, &mac, sizeof(prox_rte_ether_addr));
+               tx_ring_ip(tbase, ring, UPDATE_FROM_CTRL, mbufs[0], ip);
+               plog_dbg("UPDATE_FROM_CTRL ip "IPv4_BYTES_FMT" with mac "MAC_BYTES_FMT"\n", IP4(ip), MAC_BYTES(mac.addr_bytes));
+       }
+       task->external_ip_table[ret].nb_requests = 0;
+       return;
+}
+
 static int handle_ctrl_plane_f(struct task_base *tbase, __attribute__((unused)) struct rte_mbuf **mbuf, uint16_t n_pkts)
 {
        int ring_id = 0, j, ret = 0, n = 0;
@@ -583,86 +767,10 @@ static int handle_ctrl_plane_f(struct task_base *tbase, __attribute__((unused))
                ret +=n;
        }
        if ((task->max_vdev_id) && (poll(&task->arp_fds, 1, prox_cfg.poll_timeout) == POLL_IN)) {
-               struct nlmsghdr * nl_hdr;
-               int fd = task->arp_fds.fd;
-               int len;
-               uint32_t ip = 0;
-               prox_rte_ether_addr mac;
-               memset(&mac, 0, sizeof(mac));
-               len = recv(fd, netlink_buf, sizeof(netlink_buf), 0);
-               if (len < 0) {
-                       plog_err("Failed to recv from netlink: %d\n", errno);
-                       return ret;
-               }
-               nl_hdr = (struct nlmsghdr *)netlink_buf;
-               if (nl_hdr->nlmsg_flags & NLM_F_MULTI) {
-                       plog_err("Unexpected multipart netlink message\n");
-                       return ret;
-               }
-               if ((nl_hdr->nlmsg_type != RTM_NEWNEIGH) && (nl_hdr->nlmsg_type != RTM_DELNEIGH))
-                       return ret;
-
-               struct ndmsg *ndmsg = (struct ndmsg *)NLMSG_DATA(nl_hdr);
-               int ndm_family = ndmsg->ndm_family;
-               struct rtattr *rta = (struct rtattr *)RTM_RTA(ndmsg);
-               int rtl = RTM_PAYLOAD(nl_hdr);
-               for (; RTA_OK(rta, rtl); rta = RTA_NEXT(rta, rtl)) {
-                       switch (rta->rta_type) {
-                       case NDA_DST:
-                               ip = *((uint32_t *)RTA_DATA(rta));
-                               break;
-                       case NDA_LLADDR:
-                               mac = *((prox_rte_ether_addr *)(uint64_t *)RTA_DATA(rta));
-                               break;
-                       default:
-                               break;
-                       }
-               }
-               int idx = rte_hash_lookup(task->external_ip_hash, (const void *)&ip);
-               if (unlikely(idx < 0)) {
-                       // entry not found for this IP: we did not ask a request.
-                       // This can happen if the kernel updated the ARP table when receiving an ARP_REQUEST
-                       // We must record this, as the ARP entry is now in the kernel table
-                       if (prox_rte_is_zero_ether_addr(&mac)) {
-                               // Timeout or MAC deleted from kernel MAC table
-                               idx = rte_hash_del_key(task->external_ip_hash, (const void *)&ip);
-                               plogx_dbg("ip "IPv4_BYTES_FMT" removed from external_ip_hash\n", IP4(ip));
-                               return ret;
-                       }
-                       idx = rte_hash_add_key(task->external_ip_hash, (const void *)&ip);
-                       if (unlikely(idx < 0)) {
-                               // entry not found for this IP: Ignore the reply. This can happen for instance for
-                               // an IP used by management plane.
-                               plogx_dbg("IP "IPv4_BYTES_FMT" not found in external_ip_hash and unable to add it\n", IP4(ip));
-                               return ret;
-                       }
-                       memcpy(&task->external_ip_table[idx].mac, &mac, sizeof(prox_rte_ether_addr));
-                       plogx_dbg("ip "IPv4_BYTES_FMT" added in external_ip_hash with mac "MAC_BYTES_FMT"\n", IP4(ip), MAC_BYTES(mac.addr_bytes));
-                       return ret;
-               }
-
-               // entry found for this IP
-               uint16_t nb_requests = task->external_ip_table[idx].nb_requests;
-               if (nb_requests == 0) {
-                       return ret;
-               }
-
-               memcpy(&task->external_ip_table[idx].mac, &mac, sizeof(prox_rte_ether_addr));
-
-               // If we receive a request from multiple task for the same IP, then we update all tasks
-               if (unlikely(rte_mempool_get(tbase->l3.arp_pool, (void **)mbufs) != 0)) {
-                       plog_err("Unable to allocate a mbuf for master to core communication\n");
-                       return ret;
-               }
-               rte_mbuf_refcnt_set(mbufs[0], nb_requests);
-               for (int i = 0; i < nb_requests; i++) {
-                       struct rte_ring *ring = task->external_ip_table[idx].rings[i];
-                       struct ether_hdr_arp *hdr = rte_pktmbuf_mtod(mbufs[0], struct ether_hdr_arp *);
-                       memcpy(&hdr->arp.data.sha, &mac, sizeof(prox_rte_ether_addr));
-                       tx_ring_ip(tbase, ring, UPDATE_FROM_CTRL, mbufs[0], ip);
-                       plog_dbg("UPDATE_FROM_CTRL ip "IPv4_BYTES_FMT" with mac "MAC_BYTES_FMT"\n", IP4(ip), MAC_BYTES(mac.addr_bytes));
-               }
-               task->external_ip_table[idx].nb_requests = 0;
+               handle_arp_event(tbase);
+       }
+       if (poll(&task->route_fds, 1, prox_cfg.poll_timeout) == POLL_IN) {
+               handle_route_event(tbase);
        }
        return ret;
 }
index 6ce5185..7915445 100644 (file)
 
 enum arp_actions {
        UPDATE_FROM_CTRL,
+       ROUTE_ADD_FROM_CTRL,
+       ROUTE_DEL_FROM_CTRL,
        ARP_REQ_FROM_CTRL,
        ARP_REPLY_FROM_CTRL,
        ICMP_FROM_CTRL,
+       BGP_FROM_CTRL,
        ARP_TO_CTRL,
        ICMP_TO_CTRL,
+       BGP_TO_CTRL,
        REQ_MAC_TO_CTRL,
        PKT_FROM_TAP,
        MAX_ACTIONS
index e06529c..0474613 100644 (file)
@@ -17,6 +17,8 @@
 #include <rte_lcore.h>
 #include <rte_hash.h>
 #include <rte_hash_crc.h>
+#include <rte_lpm.h>
+
 #include "task_base.h"
 #include "lconf.h"
 #include "prefetch.h"
 #include "handle_master.h"
 #include "prox_port_cfg.h"
 #include "packet_utils.h"
+#include "prox_shared.h"
+#include "prox_lua.h"
+#include "hash_entry_types.h"
+#include "prox_compat.h"
+#include "tx_pkt.h"
 
 static inline int find_ip(struct ether_hdr_arp *pkt, uint16_t len, uint32_t *ip_dst)
 {
@@ -74,17 +81,92 @@ static inline int find_ip(struct ether_hdr_arp *pkt, uint16_t len, uint32_t *ip_
    We should check arp_update_time in the master process. This would also require the generating task to clear its arp ring
    to avoid sending many ARP while starting after a long stop.
    We could also check for arp_timeout in the master so that dataplane has only to check whether MAC is available
-   but this would require either thread safety, or the the exchange of information between master and generating core.
+   but this would require either thread safety, or the exchange of information between master and generating core.
 */
 
+/*
+ * Insert *ip_dst in ip_hash and initialise the matching slot of entries
+ * (destination IP and next hop index).
+ * On success *time is pointed at the slot's arp_update_time and SEND_ARP is
+ * returned; if the key cannot be added, DROP_MBUF is returned and *time is
+ * left untouched.
+ * tsc, hz and arp_update_time are not used by this helper.
+ */
+static inline int add_key_and_send_arp(struct rte_hash *ip_hash, uint32_t *ip_dst, struct arp_table *entries,  uint64_t tsc, uint64_t hz, uint32_t arp_update_time, prox_next_hop_index_type nh, uint64_t **time)
+{
+       const int slot = rte_hash_add_key(ip_hash, (const void *)ip_dst);
+
+       if (unlikely(slot < 0)) {
+               // No reason to send ARP, as reply would be anyhow ignored
+               plogx_err("Unable to add ip "IPv4_BYTES_FMT" in mac_hash\n", IP4(*ip_dst));
+               return DROP_MBUF;
+       }
+
+       struct arp_table *new_entry = &entries[slot];
+       new_entry->ip = *ip_dst;
+       new_entry->nh = nh;
+       *time = &new_entry->arp_update_time;
+       return SEND_ARP;
+}
+
+/*
+ * Decide what to do with a packet whose destination has an ARP table entry,
+ * by comparing tsc with the entry's arp_update_time and arp_timeout:
+ *   tsc before both deadlines                  => copy MAC, SEND_MBUF
+ *   tsc after arp_update_time, before timeout  => copy MAC, SEND_MBUF_AND_ARP
+ *   tsc after arp_update_time, timeout reached => SEND_ARP
+ *   otherwise                                  => DROP_MBUF
+ * Whenever an ARP has to be sent, *time is pointed at entry->arp_update_time.
+ * hz and arp_update_time are not used by this helper.
+ */
+static inline int update_mac_and_send_mbuf(struct arp_table *entry, prox_rte_ether_addr *mac, uint64_t tsc, uint64_t hz, uint32_t arp_update_time, uint64_t **time)
+{
+       const int arp_recent = (tsc < entry->arp_update_time);
+       const int mac_valid = (tsc < entry->arp_timeout);
+
+       if (likely(arp_recent && mac_valid)) {
+               memcpy(mac, &entry->mac, sizeof(prox_rte_ether_addr));
+               return SEND_MBUF;
+       }
+       if (tsc <= entry->arp_update_time) {
+               // MAC is unknown and we already sent an ARP recently, drop mbuf and wait for ARP reply
+               return DROP_MBUF;
+       }
+
+       // long time since we have sent an arp, send arp
+       *time = &entry->arp_update_time;
+       if (!mac_valid) {
+               // MAC still unknown, or timed out => only send ARP
+               return SEND_ARP;
+       }
+       // MAC is valid in the table => send also the mbuf
+       memcpy(mac, &entry->mac, sizeof(prox_rte_ether_addr));
+       return SEND_MBUF_AND_ARP;
+}
+
 int write_dst_mac(struct task_base *tbase, struct rte_mbuf *mbuf, uint32_t *ip_dst, uint64_t **time)
 {
        const uint64_t hz = rte_get_tsc_hz();
        struct ether_hdr_arp *packet = rte_pktmbuf_mtod(mbuf, struct ether_hdr_arp *);
        prox_rte_ether_addr *mac = &packet->ether_hdr.d_addr;
+       prox_next_hop_index_type next_hop_index;
 
        uint64_t tsc = rte_rdtsc();
        struct l3_base *l3 = &(tbase->l3);
+
+       // First find the next hop
+       if (l3->ipv4_lpm) {
+               // A routing table was configured
+               // If a gw (gateway_ipv4) is also specified, it is used as default gw only i.e. lowest priority (shortest prefix)
+               // This is implemented automatically through lpm
+               uint16_t len = rte_pktmbuf_pkt_len(mbuf);
+               if (find_ip(packet, len, ip_dst) != 0) {
+                       // Unable to find IP address => non IP packet => send it as it
+                       return SEND_MBUF;
+               }
+               if (unlikely(rte_lpm_lookup(l3->ipv4_lpm, rte_bswap32(*ip_dst), &next_hop_index) != 0)) {
+                       plog_err("No route to IP "IPv4_BYTES_FMT"\n", IP4(*ip_dst));
+                       return DROP_MBUF;
+               }
+               struct arp_table *entry = &l3->next_hops[next_hop_index];
+
+               if (entry->ip) {
+                       *ip_dst = entry->ip;
+               } else {
+                       // no next ip: this is a local route
+                       next_hop_index = MAX_HOP_INDEX;
+               }
+               // Find IP in lookup table. Send ARP if not found
+               int ret = rte_hash_lookup(l3->ip_hash, (const void *)ip_dst);
+               if (unlikely(ret < 0)) {
+                       // IP not found, try to send an ARP
+                       return add_key_and_send_arp(l3->ip_hash, ip_dst, l3->arp_table, tsc, hz, l3->arp_update_time, next_hop_index, time);
+               } else {
+                       if (entry->ip)
+                               return update_mac_and_send_mbuf(entry, mac, tsc, hz, l3->arp_update_time, time);
+                       else
+                               return update_mac_and_send_mbuf(&l3->arp_table[ret], mac, tsc, hz, l3->arp_update_time, time);
+               }
+               return 0;
+       }
+       // No Routing table specified: only a local ip and maybe a gateway
+       // Old default behavior: if a gw is specified, ALL packets go to this gateway (even those we could send w/o the gw
        if (l3->gw.ip) {
                if (likely((l3->flags & FLAG_DST_MAC_KNOWN) && (tsc < l3->gw.arp_update_time) && (tsc < l3->gw.arp_timeout))) {
                        memcpy(mac, &l3->gw.mac, sizeof(prox_rte_ether_addr));
@@ -117,25 +199,7 @@ int write_dst_mac(struct task_base *tbase, struct rte_mbuf *mbuf, uint32_t *ip_d
                for (unsigned int idx = 0; idx < l3->n_pkts; idx++) {
                        if (*ip_dst == l3->optimized_arp_table[idx].ip) {
                                 // IP address already in table
-                               if ((tsc < l3->optimized_arp_table[idx].arp_update_time) && (tsc < l3->optimized_arp_table[idx].arp_timeout)) {
-                                       // MAC address was recently updated in table, use it
-                                       memcpy(mac, &l3->optimized_arp_table[idx].mac, sizeof(prox_rte_ether_addr));
-                                       return SEND_MBUF;
-                               } else if (tsc > l3->optimized_arp_table[idx].arp_update_time) {
-                                       // ARP not sent since a long time, send ARP
-                                               *time = &l3->optimized_arp_table[idx].arp_update_time;
-                                       if (tsc < l3->optimized_arp_table[idx].arp_timeout) {
-                                               // MAC still valid => also send mbuf
-                                               memcpy(mac, &l3->optimized_arp_table[idx].mac, sizeof(prox_rte_ether_addr));
-                                               return SEND_MBUF_AND_ARP;
-                                       } else {
-                                               // MAC unvalid => only send ARP
-                                               return SEND_ARP;
-                                       }
-                               } else {
-                                       //  ARP timeout elapsed, MAC not valid anymore but waiting for ARP reply
-                                       return DROP_MBUF;
-                               }
+                               return update_mac_and_send_mbuf(&l3->optimized_arp_table[idx], mac, tsc, hz, l3->arp_update_time, time);
                        }
                }
                // IP address not found in table
@@ -156,7 +220,7 @@ int write_dst_mac(struct task_base *tbase, struct rte_mbuf *mbuf, uint32_t *ip_d
                                // If it happens, we still send the ARP as easier:
                                //      If the ARP corresponds to this error, the ARP reply will be ignored
                                //      If ARP does not correspond to this error/ip, then ARP reply will be handled.
-                               plogx_err("Unable add ip %d.%d.%d.%d in mac_hash (already %d entries)\n", IP4(ip), idx);
+                               plogx_err("Unable add ip "IPv4_BYTES_FMT" in mac_hash (already %d entries)\n", IP4(ip), idx);
                        } else {
                                memcpy(&l3->arp_table[ret], &l3->optimized_arp_table[idx], sizeof(struct arp_table));
                        }
@@ -167,35 +231,10 @@ int write_dst_mac(struct task_base *tbase, struct rte_mbuf *mbuf, uint32_t *ip_d
                int ret = rte_hash_lookup(l3->ip_hash, (const void *)ip_dst);
                if (unlikely(ret < 0)) {
                        // IP not found, try to send an ARP
-                       int ret = rte_hash_add_key(l3->ip_hash, (const void *)ip_dst);
-                       if (ret < 0) {
-                               // No reason to send ARP, as reply would be anyhow ignored
-                               plogx_err("Unable to add ip %d.%d.%d.%d in mac_hash\n", IP4(*ip_dst));
-                               return DROP_MBUF;
-                       } else {
-                               l3->arp_table[ret].ip = *ip_dst;
-                               *time = &l3->arp_table[ret].arp_update_time;
-                       }
-                       return SEND_ARP;
+                       return add_key_and_send_arp(l3->ip_hash, ip_dst, &l3->arp_table[ret], tsc, hz, l3->arp_update_time, MAX_HOP_INDEX, time);
                } else {
                        // IP has been found
-                       if (likely((tsc < l3->arp_table[ret].arp_update_time) && (tsc < l3->arp_table[ret].arp_timeout))) {
-                               // MAC still valid and ARP sent recently
-                               memcpy(mac, &l3->arp_table[ret].mac, sizeof(prox_rte_ether_addr));
-                               return SEND_MBUF;
-                       } else if (tsc > l3->arp_table[ret].arp_update_time) {
-                               // ARP not sent since a long time, send ARP
-                               *time = &l3->arp_table[ret].arp_update_time;
-                               if (tsc < l3->arp_table[ret].arp_timeout) {
-                                       // MAC still valid => send also MBUF
-                                       memcpy(mac, &l3->arp_table[ret].mac, sizeof(prox_rte_ether_addr));
-                                       return SEND_MBUF_AND_ARP;
-                               } else {
-                                       return SEND_ARP;
-                               }
-                       } else {
-                               return DROP_MBUF;
-                       }
+                       return update_mac_and_send_mbuf(&l3->arp_table[ret], mac, tsc, hz, l3->arp_update_time, time);
                }
        }
        // Should not happen
@@ -248,6 +287,7 @@ void task_start_l3(struct task_base *tbase, struct task_args *targ)
        const int NB_ARP_MBUF = 1024;
        const int ARP_MBUF_SIZE = 2048;
        const int NB_CACHE_ARP_MBUF = 256;
+       const int socket_id = rte_lcore_to_socket_id(targ->lconf->id);
 
        struct prox_port_cfg *port = find_reachable_port(targ);
         if (port && (tbase->l3.arp_pool == NULL)) {
@@ -257,6 +297,45 @@ void task_start_l3(struct task_base *tbase, struct task_args *targ)
                        tbase->local_ipv4 = rte_be_to_cpu_32(targ->local_ipv4);
                        register_ip_to_ctrl_plane(tbase->l3.tmaster, tbase->local_ipv4, tbase->l3.reachable_port_id, targ->lconf->id, targ->id);
                }
+               if (strcmp(targ->route_table, "") != 0) {
+                       struct lpm4 *lpm;
+                       int ret;
+
+                       PROX_PANIC(tbase->local_ipv4 == 0, "missing local_ipv4 will route table is specified in L3 mode\n");
+
+                       // LPM might be modified runtime => do not share with other cores
+                       ret = lua_to_lpm4(prox_lua(), GLOBAL, targ->route_table, socket_id, &lpm);
+                       PROX_PANIC(ret, "Failed to load IPv4 LPM:\n%s\n", get_lua_to_errors());
+
+                       tbase->l3.ipv4_lpm = lpm->rte_lpm;
+                       tbase->l3.next_hops = prox_zmalloc(sizeof(*tbase->l3.next_hops) * MAX_HOP_INDEX, socket_id);
+                       PROX_PANIC(tbase->l3.next_hops == NULL, "Could not allocate memory for next hop\n");
+
+                       for (uint32_t i = 0; i < MAX_HOP_INDEX; i++) {
+                               if (!lpm->next_hops[i].ip_dst)
+                                       continue;
+                               tbase->l3.nb_gws++;
+                               tbase->l3.next_hops[i].ip = rte_bswap32(lpm->next_hops[i].ip_dst);
+                               int tx_port = lpm->next_hops[i].mac_port.out_idx;
+                               // gen only supports one port right now .... hence port = 0
+                               if ((tx_port > targ->nb_txports - 1) && (tx_port > targ->nb_txrings - 1)) {
+                                       PROX_PANIC(1, "Routing Table contains port %d but only %d tx port/ %d ring:\n", tx_port, targ->nb_txports, targ->nb_txrings);
+                               }
+                       }
+                       plog_info("Using routing table %s in l3 mode, with %d gateways\n", targ->route_table, tbase->l3.nb_gws);
+
+                       // Last but one "next_hop_index" is not a gateway but direct routes
+                       tbase->l3.next_hops[tbase->l3.nb_gws].ip = 0;
+                       ret = rte_lpm_add(tbase->l3.ipv4_lpm, targ->local_ipv4, targ->local_prefix, tbase->l3.nb_gws++);
+                       PROX_PANIC(ret, "Failed to add local_ipv4 "IPv4_BYTES_FMT"/%d to lpm\n", IP4(tbase->local_ipv4), targ->local_prefix);
+                       // Last "next_hop_index" is default gw
+                       tbase->l3.next_hops[tbase->l3.nb_gws].ip = rte_bswap32(targ->gateway_ipv4);
+                       if (targ->gateway_ipv4) {
+                               ret = rte_lpm_add(tbase->l3.ipv4_lpm, targ->gateway_ipv4, 0, tbase->l3.nb_gws++);
+                               PROX_PANIC(ret, "Failed to add gateway_ipv4 "IPv4_BYTES_FMT"/%d to lpm\n", IP4(tbase->l3.gw.ip), 0);
+                       }
+               }
+
                master_init_vdev(tbase->l3.tmaster, tbase->l3.reachable_port_id, targ->lconf->id, targ->id);
                name[3]++;
                struct rte_mempool *ret = rte_mempool_create(name, NB_ARP_MBUF, ARP_MBUF_SIZE, NB_CACHE_ARP_MBUF,
@@ -284,8 +363,13 @@ void task_set_local_ip(struct task_base *tbase, uint32_t ip)
 static void reset_arp_update_time(struct l3_base *l3, uint32_t ip)
 {
        uint32_t idx;
-       plogx_info("\tMAC entry for IP "IPv4_BYTES_FMT" timeout in kernel\n", IP4(ip));
-       if (ip == l3->gw.ip) {
+       plogx_dbg("MAC entry for IP "IPv4_BYTES_FMT" timeout in kernel\n", IP4(ip));
+
+       if (l3->ipv4_lpm) {
+               int ret = rte_hash_lookup(l3->ip_hash, (const void *)&ip);
+               if (ret >= 0)
+                       l3->arp_table[ret].arp_update_time = 0;
+       } else if (ip == l3->gw.ip) {
                l3->gw.arp_update_time = 0;
        } else if (l3->n_pkts < 4) {
                for (idx = 0; idx < l3->n_pkts; idx++) {
@@ -304,17 +388,34 @@ static void reset_arp_update_time(struct l3_base *l3, uint32_t ip)
        return;
 }
 
+/*
+ * Return the index of gateway gw_ip in tbase->l3.next_hops.
+ * A gateway that is not yet in the table is appended to it.
+ * Returns MAX_HOP_INDEX when the gateway is unknown and the table is full.
+ */
+static prox_next_hop_index_type get_nh_index(struct task_base *tbase, uint32_t gw_ip)
+{
+       struct l3_base *l3 = &tbase->l3;
+       prox_next_hop_index_type idx = 0;
+
+       // Reuse the slot of a gateway which is already known
+       while (idx < l3->nb_gws) {
+               if (l3->next_hops[idx].ip == gw_ip)
+                       return idx;
+               idx++;
+       }
+
+       // No free slot left: report it through the out-of-range index
+       if (l3->nb_gws >= MAX_HOP_INDEX)
+               return MAX_HOP_INDEX;
+
+       // The scan stopped at idx == nb_gws, i.e. the first unused slot
+       l3->next_hops[idx].ip = gw_ip;
+       l3->nb_gws++;
+       return idx;
+}
 void handle_ctrl_plane_pkts(struct task_base *tbase, struct rte_mbuf **mbufs, uint16_t n_pkts)
 {
        uint8_t out[1];
        const uint64_t hz = rte_get_tsc_hz();
-       uint32_t ip, ip_dst, idx;
-       int j;
+       uint32_t ip, ip_dst, idx, gateway_ip, prefix;
+       prox_next_hop_index_type gateway_index;
+       int j, ret, modified_route;
        uint16_t command;
        struct ether_hdr_arp *hdr;
        struct l3_base *l3 = &tbase->l3;
        uint64_t tsc= rte_rdtsc();
-       uint64_t update_time = l3->arp_timeout * hz / 1000;
+       uint64_t arp_timeout = l3->arp_timeout * hz / 1000;
+       uint32_t nh;
 
        for (j = 0; j < n_pkts; ++j) {
                PREFETCH0(mbufs[j]);
@@ -328,6 +429,38 @@ void handle_ctrl_plane_pkts(struct task_base *tbase, struct rte_mbuf **mbufs, ui
                command = mbufs[j]->udata64 & 0xFFFF;
                plogx_dbg("\tReceived %s mbuf %p\n", actions_string[command], mbufs[j]);
                switch(command) {
+               case ROUTE_ADD_FROM_CTRL:
+                       ip = ctrl_ring_get_ip(mbufs[j]);
+                       gateway_ip = ctrl_ring_get_gateway_ip(mbufs[j]);
+                       prefix = ctrl_ring_get_prefix(mbufs[j]);
+                       gateway_index = get_nh_index(tbase, gateway_ip);
+                       if (gateway_index >= MAX_HOP_INDEX) {
+                               plog_err("Unable to find or define gateway index - too many\n");
+                               return;
+                       }
+                       modified_route = rte_lpm_is_rule_present(tbase->l3.ipv4_lpm, rte_bswap32(ip), prefix, &nh);
+                       ret = rte_lpm_add(tbase->l3.ipv4_lpm, rte_bswap32(ip), prefix, gateway_index);
+                       if (ret < 0) {
+                               plog_err("Failed to add route to "IPv4_BYTES_FMT"/%d using "IPv4_BYTES_FMT"(index = %d)\n", IP4(ip), prefix, IP4(gateway_ip), gateway_index);
+                       } else if (modified_route)
+                               plogx_dbg("Modified route to "IPv4_BYTES_FMT"/%d using "IPv4_BYTES_FMT"(index = %d) (was using "IPv4_BYTES_FMT"(index = %d)\n", IP4(ip), prefix, IP4(gateway_ip), gateway_index, IP4(tbase->l3.next_hops[nh].ip), nh);
+                       else {
+                               plogx_dbg("Added new route to "IPv4_BYTES_FMT"/%d using "IPv4_BYTES_FMT"(index = %d)\n", IP4(ip), prefix, IP4(gateway_ip), gateway_index);
+                       }
+                       break;
+               case ROUTE_DEL_FROM_CTRL:
+                       // Route withdrawal: destination and prefix length are read back from the mbuf
+                       ip = ctrl_ring_get_ip(mbufs[j]);
+                       prefix = ctrl_ring_get_prefix(mbufs[j]);
+
+                       // Only rules which are present in the LPM are deleted
+                       ret = rte_lpm_is_rule_present(tbase->l3.ipv4_lpm, rte_bswap32(ip), prefix, &nh);
+                       if (ret > 0) {
+                               ret = rte_lpm_delete(tbase->l3.ipv4_lpm, rte_bswap32(ip), prefix);
+                               if (ret < 0) {
+                                       plog_err("Failed to delete route to "IPv4_BYTES_FMT"/%d\n", IP4(ip), prefix);
+                               } else {
+                                       plog_info("Deleted route to "IPv4_BYTES_FMT"/%d\n", IP4(ip), prefix);
+                               }
+                       }
+                       // NOTE(review): unlike UPDATE_FROM_CTRL, this case does not tx_drop() the mbuf - confirm who releases it
+                       break;
                case UPDATE_FROM_CTRL:
                        hdr = rte_pktmbuf_mtod(mbufs[j], struct ether_hdr_arp *);
                        ip = (mbufs[j]->udata64 >> 32) & 0xFFFFFFFF;
@@ -337,16 +470,33 @@ void handle_ctrl_plane_pkts(struct task_base *tbase, struct rte_mbuf **mbufs, ui
                                // This will cause us to send new ARP request
                                // However, as arp_timeout not touched, we should continue sending our regular IP packets
                                reset_arp_update_time(l3, ip);
-                               plogx_info("\tTimeout for MAC entry for IP "IPv4_BYTES_FMT"\n", IP4(ip));
                                return;
                        } else
                                plogx_dbg("\tUpdating MAC entry for IP "IPv4_BYTES_FMT" with MAC "MAC_BYTES_FMT"\n",
                                        IP4(ip), MAC_BYTES(hdr->arp.data.sha.addr_bytes));
-                       if (ip == l3->gw.ip) {
+
+                       if (l3->ipv4_lpm) {
+                               uint32_t nh;
+                               struct arp_table *entry;
+                               ret = rte_hash_add_key(l3->ip_hash, (const void *)&ip);
+                               if (ret < 0) {
+                                       plogx_info("Unable add ip "IPv4_BYTES_FMT" in mac_hash\n", IP4(ip));
+                               } else if ((nh = l3->arp_table[ret].nh) != MAX_HOP_INDEX) {
+                                       entry = &l3->next_hops[nh];
+                                       memcpy(&entry->mac, &(hdr->arp.data.sha), sizeof(prox_rte_ether_addr));
+                                       entry->arp_timeout = tsc + arp_timeout;
+                                       update_arp_update_time(l3, &entry->arp_update_time, l3->arp_update_time);
+                               } else {
+                                       memcpy(&l3->arp_table[ret].mac, &(hdr->arp.data.sha), sizeof(prox_rte_ether_addr));
+                                       l3->arp_table[ret].arp_timeout = tsc + arp_timeout;
+                                       update_arp_update_time(l3, &l3->arp_table[ret].arp_update_time, l3->arp_update_time);
+                               }
+                       }
+                       else if (ip == l3->gw.ip) {
                                // MAC address of the gateway
                                memcpy(&l3->gw.mac, &hdr->arp.data.sha, 6);
                                l3->flags |= FLAG_DST_MAC_KNOWN;
-                               l3->gw.arp_timeout = tsc + update_time;
+                               l3->gw.arp_timeout = tsc + arp_timeout;
                                update_arp_update_time(l3, &l3->gw.arp_update_time, l3->arp_update_time);
                        } else if (l3->n_pkts < 4) {
                                // Few packets tracked - should be faster to loop through them thean using a hash table
@@ -357,28 +507,40 @@ void handle_ctrl_plane_pkts(struct task_base *tbase, struct rte_mbuf **mbufs, ui
                                }
                                if (idx < l3->n_pkts) {
                                        memcpy(&l3->optimized_arp_table[idx].mac, &(hdr->arp.data.sha), sizeof(prox_rte_ether_addr));
-                                       l3->optimized_arp_table[idx].arp_timeout = tsc + update_time;
+                                       l3->optimized_arp_table[idx].arp_timeout = tsc + arp_timeout;
                                        update_arp_update_time(l3, &l3->optimized_arp_table[idx].arp_update_time, l3->arp_update_time);
                                }
                        } else {
-                               int ret = rte_hash_add_key(l3->ip_hash, (const void *)&ip);
+                               ret = rte_hash_add_key(l3->ip_hash, (const void *)&ip);
                                if (ret < 0) {
-                                       plogx_info("Unable add ip %d.%d.%d.%d in mac_hash\n", IP4(ip));
+                                       plogx_info("Unable add ip "IPv4_BYTES_FMT" in mac_hash\n", IP4(ip));
                                } else {
                                        memcpy(&l3->arp_table[ret].mac, &(hdr->arp.data.sha), sizeof(prox_rte_ether_addr));
-                                       l3->arp_table[ret].arp_timeout = tsc + update_time;
+                                       l3->arp_table[ret].arp_timeout = tsc + arp_timeout;
                                        update_arp_update_time(l3, &l3->arp_table[ret].arp_update_time, l3->arp_update_time);
                                }
                        }
                        tx_drop(mbufs[j]);
                        break;
                case ARP_REPLY_FROM_CTRL:
-               case ICMP_FROM_CTRL:
                case ARP_REQ_FROM_CTRL:
+                       out[0] = 0;
+                       // tx_ctrlplane_pkt does not drop packets
+                       plogx_dbg("\tForwarding (ARP) packet from master\n");
+                       tbase->aux->tx_ctrlplane_pkt(tbase, &mbufs[j], 1, out);
+                       TASK_STATS_ADD_TX_NON_DP(&tbase->aux->stats, 1);
+                       break;
+               case ICMP_FROM_CTRL:
+                       out[0] = 0;
+                       // tx_ctrlplane_pkt does not drop packets
+                       plogx_dbg("\tForwarding (PING) packet from master\n");
+                       tbase->aux->tx_ctrlplane_pkt(tbase, &mbufs[j], 1, out);
+                       TASK_STATS_ADD_TX_NON_DP(&tbase->aux->stats, 1);
+                       break;
                case PKT_FROM_TAP:
                        out[0] = 0;
                        // tx_ctrlplane_pkt does not drop packets
-                       plogx_dbg("\tForwarding (ARP/PING) packet from master\n");
+                       plogx_dbg("\tForwarding TAP packet from master\n");
                        tbase->aux->tx_ctrlplane_pkt(tbase, &mbufs[j], 1, out);
                        TASK_STATS_ADD_TX_NON_DP(&tbase->aux->stats, 1);
                        break;
index a111b94..021528d 100644 (file)
@@ -27,7 +27,7 @@
 #define FLAG_DST_MAC_KNOWN     1
 #define MAX_ARP_ENTRIES        65536
 
-#define IP4(x) x & 0xff, (x >> 8) & 0xff, (x >> 16) & 0xff, x >> 24
+// Expands to the four bytes of x, lowest byte first, as a comma-separated argument list for IPv4_BYTES_FMT.
+// The argument is parenthesized so that expressions such as IP4(a | b) expand with the intended precedence.
+#define IP4(x) (x) & 0xff, ((x) >> 8) & 0xff, ((x) >> 16) & 0xff, (x) >> 24    // From Network (BE)
 enum {
        SEND_MBUF_AND_ARP,
        SEND_MBUF,
@@ -43,6 +43,7 @@ struct arp_table {
        uint64_t arp_update_time;
        uint64_t arp_timeout;
        uint32_t ip;
+       uint32_t nh;
        prox_rte_ether_addr mac;
 };
 struct l3_base {
@@ -55,12 +56,15 @@ struct l3_base {
        uint8_t task_id;
        uint32_t arp_timeout;
        uint32_t arp_update_time;
+       uint seed;
+       prox_next_hop_index_type nb_gws;
        struct arp_table gw;
        struct arp_table optimized_arp_table[4];
        struct rte_hash *ip_hash;
        struct arp_table *arp_table;
        struct rte_mempool *arp_pool;
-       uint seed;
+       struct rte_lpm *ipv4_lpm;
+       struct arp_table *next_hops;
 };
 
 void task_init_l3(struct task_base *tbase, struct task_args *targ);
@@ -69,6 +73,7 @@ int write_dst_mac(struct task_base *tbase, struct rte_mbuf *mbuf, uint32_t *ip_d
 void task_set_gateway_ip(struct task_base *tbase, uint32_t ip);
 void task_set_local_ip(struct task_base *tbase, uint32_t ip);
 void handle_ctrl_plane_pkts(struct task_base *tbase, struct rte_mbuf **mbufs, uint16_t n_pkts);
+
 static inline void update_arp_update_time(struct l3_base *l3, uint64_t *ptr, uint32_t base)
 {
        // randomize timers - from 0.5 to 1.5 * configured time
@@ -77,5 +82,4 @@ static inline void update_arp_update_time(struct l3_base *l3, uint64_t *ptr, uin
        uint64_t rand = 500 + (1000L * rand_r(&l3->seed)) / RAND_MAX;
        *ptr = tsc + (base * rand / 1000) * hz / 1000;
 }
-
 #endif /* _PACKET_UTILS_H_ */
index 4108220..30b4cbd 100644 (file)
@@ -1389,10 +1389,23 @@ static int get_core_cfg(unsigned sindex, char *str, void *data)
        if (STR_EQ(str, "gateway ipv4")) { /* Gateway IP address used when generating */
                if ((targ->flags & TASK_ARG_L3) == 0)
                        plog_warn("gateway ipv4 configured but L3 sub mode not enabled\n");
+               if (targ->local_ipv4)
+                       targ->local_prefix = 32;
                return parse_ip(&targ->gateway_ipv4, pkey);
        }
        if (STR_EQ(str, "local ipv4")) { /* source IP address to be used for packets */
-               return parse_ip(&targ->local_ipv4, pkey);
+               struct ip4_subnet cidr;
+               if (parse_ip4_cidr(&cidr, pkey) != 0) {
+                       if (targ->gateway_ipv4)
+                               targ->local_prefix = 32;
+                       else
+                               targ->local_prefix = 0;
+                       return parse_ip(&targ->local_ipv4, pkey);
+               } else {
+                       targ->local_ipv4 = cidr.ip;
+                       targ->local_prefix = cidr.prefix;
+                       return 0;
+               }
        }
        if (STR_EQ(str, "remote ipv4")) { /* source IP address to be used for packets */
                return parse_ip(&targ->remote_ipv4, pkey);
index e181cd8..bd059a6 100644 (file)
@@ -37,6 +37,12 @@ struct prox_rte_table_params {
        uint64_t seed;
 };
 
+// Integer type used for next-hop indexes; its width is selected from the DPDK version.
+// NOTE(review): presumably tracks the rte_lpm next-hop width of the DPDK release - confirm.
+#if RTE_VERSION >= RTE_VERSION_NUM(16,4,0,1)
+typedef uint32_t prox_next_hop_index_type;
+#else
+typedef uint8_t prox_next_hop_index_type;
+#endif
+
 #if RTE_VERSION < RTE_VERSION_NUM(17,11,0,0)
 
 static void *prox_rte_table_create(struct prox_rte_table_params *params, int socket_id, uint32_t entry_size)
index 3ef3d47..bc1671d 100644 (file)
@@ -437,11 +437,11 @@ int lua_to_next_hop(struct lua_State *L, enum lua_place from, const char *name,
        while (lua_next(L, -2)) {
                if (lua_to_int(L, TABLE, "id", &next_hop_index) ||
                    lua_to_int(L, TABLE, "port_id", &port_id) ||
-                   lua_to_ip(L, TABLE, "ip", &ip) ||
-                   lua_to_mac(L, TABLE, "mac", &mac) ||
-                   lua_to_int(L, TABLE, "mpls", &mpls))
+                   lua_to_ip(L, TABLE, "ip", &ip))
                        return -1;
 
+               lua_to_mac(L, TABLE, "mac", &mac);
+               lua_to_int(L, TABLE, "mpls", &mpls);
                PROX_PANIC(port_id >= PROX_MAX_PORTS, "Port id too high (only supporting %d ports)\n", PROX_MAX_PORTS);
                PROX_PANIC(next_hop_index >= MAX_HOP_INDEX, "Next-hop to high (only supporting %d next hops)\n", MAX_HOP_INDEX);
 
@@ -504,6 +504,7 @@ int lua_to_next_hop6(struct lua_State *L, enum lua_place from, const char *name,
        return 0;
 }
 
+#define MAX_NEW_RULES  128
 int lua_to_routes4(struct lua_State *L, enum lua_place from, const char *name, uint8_t socket, struct lpm4 *lpm)
 {
        struct ip4_subnet dst;
@@ -514,11 +515,12 @@ int lua_to_routes4(struct lua_State *L, enum lua_place from, const char *name, u
        char lpm_name[64];
        int ret;
        int pop;
+       static int count = 1;
 
        if ((pop = lua_getfrom(L, from, name)) < 0)
                return -1;
 
-       snprintf(lpm_name, sizeof(lpm_name), "IPv4_lpm_s%u", socket);
+       snprintf(lpm_name, sizeof(lpm_name), "IPv4_lpm_s%u_%d", socket, count++);
 
        if (!lua_istable(L, -1)) {
                set_err("Data is not a table\n");
@@ -531,12 +533,12 @@ int lua_to_routes4(struct lua_State *L, enum lua_place from, const char *name, u
        lua_pop(L, 1);
 #if RTE_VERSION >= RTE_VERSION_NUM(16,4,0,1)
        struct rte_lpm_config conf;
-       conf.max_rules = 2 * n_tot_rules;
+       conf.max_rules = 2 * n_tot_rules + MAX_NEW_RULES;
        conf.number_tbl8s = 256;
        conf.flags = 0;
        new_lpm = rte_lpm_create(lpm_name, socket, &conf);
 #else
-       new_lpm = rte_lpm_create(lpm_name, socket, 2 * n_tot_rules, 0);
+       new_lpm = rte_lpm_create(lpm_name, socket, 2 * n_tot_rules + MAX_NEW_RULES, 0);
 #endif
        PROX_PANIC(NULL == new_lpm, "Failed to allocate lpm\n");
 
index 4832066..6a6112b 100644 (file)
@@ -30,6 +30,8 @@
 #include "handle_master.h"
 #include "input.h" /* Needed for callback on dump */
 
+#define TCP_PORT_BGP   rte_cpu_to_be_16(179)
+
 /* _param version of the rx_pkt_hw functions are used to create two
    instances of very similar variations of these functions. The
    variations are specified by the "multi" parameter which significies
@@ -138,10 +140,15 @@ static uint16_t rx_pkt_hw_param(struct task_base *tbase, struct rte_mbuf ***mbuf
                        if (likely(hdr_arp[i]->ether_hdr.ether_type == ETYPE_IPv4)) {
                                hdr = (prox_rte_ether_hdr *)hdr_arp[i];
                                prox_rte_ipv4_hdr *pip = (prox_rte_ipv4_hdr *)(hdr + 1);
+                               prox_rte_tcp_hdr *tcp = (prox_rte_tcp_hdr *)(pip + 1);
                                if (pip->next_proto_id == IPPROTO_ICMP) {
                                        dump_l3(tbase, mbufs[i]);
                                        tx_ring(tbase, tbase->l3.ctrl_plane_ring, ICMP_TO_CTRL, mbufs[i]);
                                        skip++;
+                               } else if ((tcp->src_port == TCP_PORT_BGP) || (tcp->dst_port == TCP_PORT_BGP)) {
+                                       dump_l3(tbase, mbufs[i]);
+                                       tx_ring(tbase, tbase->l3.ctrl_plane_ring, BGP_TO_CTRL, mbufs[i]);
+                                       skip++;
                                } else if (unlikely(skip)) {
                                        mbufs[i - skip] = mbufs[i];
                                }
@@ -202,13 +209,19 @@ static inline uint16_t rx_pkt_hw1_param(struct task_base *tbase, struct rte_mbuf
                        PREFETCH0(hdr_arp[i]);
                }
                for (i = 0; i < nb_rx; i++) {
+                       // plog_info("ether_type = %x\n", hdr_arp[i]->ether_hdr.ether_type);
                        if (likely(hdr_arp[i]->ether_hdr.ether_type == ETYPE_IPv4)) {
                                hdr = (prox_rte_ether_hdr *)hdr_arp[i];
                                prox_rte_ipv4_hdr *pip = (prox_rte_ipv4_hdr *)(hdr + 1);
+                               prox_rte_tcp_hdr *tcp = (prox_rte_tcp_hdr *)(pip + 1);
                                if (pip->next_proto_id == IPPROTO_ICMP) {
                                        dump_l3(tbase, mbufs[i]);
                                        tx_ring(tbase, tbase->l3.ctrl_plane_ring, ICMP_TO_CTRL, mbufs[i]);
                                        skip++;
+                               } else if ((tcp->src_port == TCP_PORT_BGP) || (tcp->dst_port == TCP_PORT_BGP)) {
+                                       dump_l3(tbase, mbufs[i]);
+                                       tx_ring(tbase, tbase->l3.ctrl_plane_ring, BGP_TO_CTRL, mbufs[i]);
+                                       skip++;
                                } else if (unlikely(skip)) {
                                        mbufs[i - skip] = mbufs[i];
                                }
index 4108f54..98c0a8d 100644 (file)
@@ -129,6 +129,7 @@ struct task_args {
        uint32_t               gateway_ipv4;
        uint32_t               local_ipv4;
        uint32_t               remote_ipv4;
+       uint32_t               local_prefix;
        uint32_t               arp_timeout;
        uint32_t               arp_update_time;
        struct ipv6_addr       local_ipv6;    /* For IPv6 Tunnel, it's the local tunnel endpoint address */
index 8bf501f..2a4f53b 100644 (file)
@@ -845,3 +845,73 @@ void tx_ring(struct task_base *tbase, struct rte_ring *ring, uint16_t command,
                rte_pktmbuf_free(mbuf);
        }
 }
+
+/*
+ * Send a route add (add != 0) or route delete (add == 0) command over ring.
+ * The command, ip, gateway_ip and prefix are attached to mbuf through the
+ * ctrl_ring_set_* helpers before the mbuf is enqueued.
+ * When the enqueue fails the drop is counted and the mbuf is freed here.
+ */
+void tx_ring_route(struct task_base *tbase, struct rte_ring *ring, int add, struct rte_mbuf *mbuf, uint32_t ip, uint32_t gateway_ip, uint32_t prefix)
+{
+       const uint8_t command = add ? ROUTE_ADD_FROM_CTRL : ROUTE_DEL_FROM_CTRL;
+
+       plogx_dbg("\tSending command %s to ring %p using mbuf %p - ring size now %d\n", actions_string[command], ring, mbuf, rte_ring_free_count(ring));
+
+       // Attach the route description to the mbuf
+       ctrl_ring_set_command(mbuf, command);
+       ctrl_ring_set_ip(mbuf, ip);
+       ctrl_ring_set_gateway_ip(mbuf, gateway_ip);
+       ctrl_ring_set_prefix(mbuf, prefix);
+
+       // Trace the mbuf only while a dump is in progress
+       if (tbase->aux->task_rt_dump.cur_trace)
+               trace_one_rx_pkt(tbase, mbuf);
+
+       const int enqueue_status = rte_ring_enqueue(ring, mbuf);
+       if (unlikely(enqueue_status != 0)) {
+               // The mbuf was not handed over: account for the drop and release it
+               plogx_dbg("\tFail to send command %s to ring %p using mbuf %p - ring size now %d\n", actions_string[command], ring, mbuf, rte_ring_free_count(ring));
+               TASK_STATS_ADD_DROP_DISCARD(&tbase->aux->stats, 1);
+               rte_pktmbuf_free(mbuf);
+       }
+}
+
+/* Store the control command in the udata64 field of mbuf. */
+void ctrl_ring_set_command(struct rte_mbuf *mbuf, uint64_t udata64)
+{
+       mbuf->udata64 = udata64;
+}
+
+/* Return the value held in the udata64 field of mbuf. */
+uint64_t ctrl_ring_get_command(struct rte_mbuf *mbuf)
+{
+       return mbuf->udata64;
+}
+
+/*
+ * Return the struct prox_headroom located just before the packet data of mbuf,
+ * i.e. in the sizeof(struct prox_headroom) bytes preceding rte_pktmbuf_mtod().
+ * NOTE(review): assumes the mbuf has at least that much headroom - not checked here.
+ */
+static inline struct prox_headroom *ctrl_ring_headroom(struct rte_mbuf *mbuf)
+{
+       return (struct prox_headroom *)(rte_pktmbuf_mtod(mbuf, uint8_t*) - sizeof(struct prox_headroom));
+}
+
+/* Store the route destination IP in the headroom of mbuf. */
+void ctrl_ring_set_ip(struct rte_mbuf *mbuf, uint32_t udata32)
+{
+       ctrl_ring_headroom(mbuf)->ip = udata32;
+}
+
+/* Return the route destination IP stored in the headroom of mbuf. */
+uint32_t ctrl_ring_get_ip(struct rte_mbuf *mbuf)
+{
+       return ctrl_ring_headroom(mbuf)->ip;
+}
+
+/* Store the gateway IP in the headroom of mbuf. */
+void ctrl_ring_set_gateway_ip(struct rte_mbuf *mbuf, uint32_t udata32)
+{
+       ctrl_ring_headroom(mbuf)->gateway_ip = udata32;
+}
+
+/* Return the gateway IP stored in the headroom of mbuf. */
+uint32_t ctrl_ring_get_gateway_ip(struct rte_mbuf *mbuf)
+{
+       return ctrl_ring_headroom(mbuf)->gateway_ip;
+}
+
+/* Store the prefix length in the headroom of mbuf. */
+void ctrl_ring_set_prefix(struct rte_mbuf *mbuf, uint32_t udata32)
+{
+       ctrl_ring_headroom(mbuf)->prefix = udata32;
+}
+
+/* Return the prefix length stored in the headroom of mbuf. */
+uint32_t ctrl_ring_get_prefix(struct rte_mbuf *mbuf)
+{
+       return ctrl_ring_headroom(mbuf)->prefix;
+}
index 708a983..f7443cf 100644 (file)
 struct task_base;
 struct rte_mbuf;
 
+struct prox_headroom { /* Per-mbuf route parameters exchanged through the ctrl_ring_set_xxx / ctrl_ring_get_xxx helpers */
+       uint64_t command;    /* NOTE(review): not accessed by the ctrl_ring helpers shown in this change - confirm usage */
+       uint32_t ip;         /* Written by ctrl_ring_set_ip(), read by ctrl_ring_get_ip() */
+       uint32_t prefix;     /* Written by ctrl_ring_set_prefix(), read by ctrl_ring_get_prefix() */
+       uint32_t gateway_ip; /* Written by ctrl_ring_set_gateway_ip(), read by ctrl_ring_get_gateway_ip() */
+} __attribute__((packed)); /* packed: no padding between or after the fields */
+
 void flush_queues_hw(struct task_base *tbase);
 void flush_queues_sw(struct task_base *tbase);
 
@@ -86,4 +93,14 @@ int tx_ring_cti(struct task_base *tbase, struct rte_ring *ring, uint16_t command
 void tx_ring_ip(struct task_base *tbase, struct rte_ring *ring, uint16_t command, struct rte_mbuf *mbuf, uint32_t ip);
 void tx_ring(struct task_base *tbase, struct rte_ring *ring, uint16_t command, struct rte_mbuf *mbuf);
 
+void ctrl_ring_set_command(struct rte_mbuf *mbuf, uint64_t udata64);
+uint64_t ctrl_ring_get_command(struct rte_mbuf *mbuf);
+void ctrl_ring_set_ip(struct rte_mbuf *mbuf, uint32_t udata32);
+uint32_t ctrl_ring_get_ip(struct rte_mbuf *mbuf);
+void ctrl_ring_set_gateway_ip(struct rte_mbuf *mbuf, uint32_t udata32);
+uint32_t ctrl_ring_get_gateway_ip(struct rte_mbuf *mbuf);
+void ctrl_ring_set_prefix(struct rte_mbuf *mbuf, uint32_t udata32);
+uint32_t ctrl_ring_get_prefix(struct rte_mbuf *mbuf);
+void tx_ring_route(struct task_base *tbase, struct rte_ring *ring, int add, struct rte_mbuf *mbuf, uint32_t ip, uint32_t gateway_ip, uint32_t prefix);
+
 #endif /* _TX_PKT_H_ */