NFVBENCH-189: Add a fix to work around the i40e_VF port initialization issue
[nfvbench.git] / nfvbench / traffic_gen / trex_gen.py
index f5c2afb..4e20f73 100644 (file)
@@ -15,6 +15,7 @@
 
 import math
 import os
+import sys
 import random
 import time
 import traceback
@@ -674,9 +675,78 @@ class TRex(AbstractTrafficGenerator):
     def __connect(self, client):
         client.connect()
 
+    def __local_server_status(self):
+        """ The TRex server may have started but failed initializing... and stopped.
+        This piece of code is especially designed to address
+        the case when a fatal failure occurs on a DPDK init call.
+        The TRex algorithm should be revised to include some missing timeouts (?)
+        status returned:
+          0: no error detected
+          1: fatal error detected - should lead to exiting the run
+          2: error detected that could be solved by starting again
+        The diagnostic is based on parsing the local trex log file (improvable)
+        """
+        status = 0
+        message = None
+        failure = None
+        exited = None
+        cause = None
+        error = None
+        before = None
+        after = None
+        last = None
+        try:
+            with open('/tmp/trex.log', 'r') as trex_log:
+                for _line in trex_log:
+                    line = _line.strip()
+                    if line.startswith('Usage:'):
+                        break
+                    if 'ports are bound' in line:
+                        continue
+                    if 'please wait' in line:
+                        continue
+                    if 'exit' in line.lower():
+                        exited = line
+                    elif 'cause' in line.lower():
+                        cause = line
+                    elif 'fail' in line.lower():
+                        failure = line
+                    elif 'msg' in line.lower():
+                        message = line
+                    elif (error is not None) and line:
+                        after = line
+                    elif line.startswith('Error:') or line.startswith('ERROR'):
+                        error = line
+                        before = last
+                    last = line
+        except FileNotFoundError:
+            pass
+        if exited is not None:
+            status = 1
+            LOG.info("\x1b[1m%s\x1b[0m %s", 'TRex failed initializing:', exited)
+            if cause is not None:
+                LOG.info("TRex [cont'd] %s", cause)
+            if failure is not None:
+                LOG.info("TRex [cont'd] %s", failure)
+            if message is not None:
+                LOG.info("TRex [cont'd] %s", message)
+                if 'not supported yet' in message.lower():
+                    LOG.info("TRex [cont'd] Try starting again!")
+                    status = 2
+        elif error is not None:
+            status = 1
+            LOG.info("\x1b[1m%s\x1b[0m %s", 'TRex failed initializing:', error)
+            if after is not None:
+                LOG.info("TRex [cont'd] %s", after)
+            elif before is not None:
+                LOG.info("TRex [cont'd] %s", before)
+        return status
+
     def __connect_after_start(self):
         # after start, Trex may take a bit of time to initialize
         # so we need to retry a few times
+        # we try to capture recoverable error cases (checking status)
+        status = 0
         for it in range(self.config.generic_retry_count):
             try:
                 time.sleep(1)
@@ -685,10 +755,23 @@ class TRex(AbstractTrafficGenerator):
             except Exception as ex:
                 if it == (self.config.generic_retry_count - 1):
                     raise
+                status = self.__local_server_status()
+                if status > 0:
+                    # No need to wait anymore, something went wrong and TRex exited
+                    if status == 1:
+                        LOG.info("\x1b[1m%s\x1b[0m", 'TRex failed starting!')
+                        print("More information? Try the command: "
+                            + "\x1b[1mnfvbench --show-trex-log\x1b[0m")
+                        sys.exit(0)
+                    if status == 2:
+                        # a new start will follow
+                        return status
                 LOG.info("Retrying connection to TRex (%s)...", ex.msg)
+        return status
 
     def connect(self):
         """Connect to the TRex server."""
+        status = 0
         server_ip = self.generator_config.ip
         LOG.info("Connecting to TRex (%s)...", server_ip)
 
@@ -700,13 +783,20 @@ class TRex(AbstractTrafficGenerator):
             if server_ip == '127.0.0.1':
                 config_updated = self.__check_config()
                 if config_updated or self.config.restart:
-                    self.__restart()
+                    status = self.__restart()
         except (TimeoutError, STLError) as e:
             if server_ip == '127.0.0.1':
-                self.__start_local_server()
+                status = self.__start_local_server()
             else:
                 raise TrafficGeneratorException(e.message) from e
 
+        if status == 2:
+            # Workaround in case of a failed TRex server initialization
+            # we try to start it again (twice maximum)
+            # which may allow low level initialization to complete.
+            if self.__start_local_server() == 2:
+                self.__start_local_server()
+
         ports = list(self.generator_config.ports)
         self.port_handle = ports
         # Prepare the ports
@@ -742,7 +832,7 @@ class TRex(AbstractTrafficGenerator):
         try:
             LOG.info("Starting TRex ...")
             self.__start_server()
-            self.__connect_after_start()
+            status = self.__connect_after_start()
         except (TimeoutError, STLError) as e:
             LOG.error('Cannot connect to TRex')
             LOG.error(traceback.format_exc())
@@ -762,6 +852,7 @@ class TRex(AbstractTrafficGenerator):
             else:
                 message = e.message
             raise TrafficGeneratorException(message) from e
+        return status
 
     def __start_server(self):
         server = TRexTrafficServer()
@@ -780,7 +871,8 @@ class TRex(AbstractTrafficGenerator):
             if not self.client.is_connected():
                 LOG.info("TRex is stopped...")
                 break
-        self.__start_local_server()
+        # Start and report a possible failure
+        return self.__start_local_server()
 
     def __stop_server(self):
         if self.generator_config.ip == '127.0.0.1':
@@ -869,8 +961,10 @@ class TRex(AbstractTrafficGenerator):
                           chain_count)
                 break
 
-        # if the capture from the TRex console was started before the arp request step,
-        # it keeps 'service_mode' enabled, otherwise, it disables the 'service_mode'
+        # A traffic capture may have been started (from a TRex console) at this time.
+        # If requested, we keep the service mode enabled here; otherwise we disable it.
+        #  | Disabling the service mode while a capture is in progress
+        #  | would cause the application to stop/crash with an error.
         if not self.config.service_mode:
             self.client.set_service_mode(ports=self.port_handle, enabled=False)
         if len(arp_dest_macs) == len(self.port_handle):
@@ -889,7 +983,8 @@ class TRex(AbstractTrafficGenerator):
                     total_rate += int(r['rate_pps'])
             else:
                 mult = 1
-                total_rate = utils.convert_rates(l2frame_size, rates[0], intf_speed)
+                r = utils.convert_rates(l2frame_size, rates[0], intf_speed)
+                total_rate = int(r['rate_pps'])
             # rate must be enough for latency stream and at least 1 pps for base stream per chain
             required_rate = (self.LATENCY_PPS + 1) * self.config.service_chain_count * mult
             result = utils.convert_rates(l2frame_size,
@@ -1020,8 +1115,10 @@ class TRex(AbstractTrafficGenerator):
         if self.capture_id:
             self.client.stop_capture(capture_id=self.capture_id['id'])
             self.capture_id = None
-            # if the capture from TRex console was started before the connectivity step,
-            # it keeps 'service_mode' enabled, otherwise, it disables the 'service_mode'
+            # A traffic capture may have been started (from a TRex console) at this time.
+            # If requested, we keep the service mode enabled here; otherwise we disable it.
+            #  | Disabling the service mode while a capture is in progress
+            #  | would cause the application to stop/crash with an error.
             if not self.config.service_mode:
                 self.client.set_service_mode(ports=self.port_handle, enabled=False)
 
@@ -1036,5 +1133,5 @@ class TRex(AbstractTrafficGenerator):
                 pass
 
     def set_service_mode(self, enabled=True):
-        """Enable/disable the 'service_mode'."""
+        """Enable/disable the 'service' mode."""
         self.client.set_service_mode(ports=self.port_handle, enabled=enabled)