deploy/cloud/deployment.py

   1 ###############################################################################
   2 # Copyright (c) 2015 Ericsson AB and others.
   3 # szilard.cserey@ericsson.com
   4 # All rights reserved. This program and the accompanying materials
   5 # are made available under the terms of the Apache License, Version 2.0
   6 # which accompanies this distribution, and is available at
   7 # http://www.apache.org/licenses/LICENSE-2.0
   8 ###############################################################################
   9
  10 import time
  11 import re
  12 import json
  13
  14 from common import (
  15     N,
  16     exec_cmd,
  17     parse,
  18     err,
  19     log,
  20 )
  21
  22 SEARCH_TEXT = '(err)'
  23 LOG_FILE = '/var/log/puppet.log'
  24 GREP_LINES_OF_LEADING_CONTEXT = 100
  25 GREP_LINES_OF_TRAILING_CONTEXT = 100
  26 LIST_OF_CHAR_TO_BE_ESCAPED = ['[', ']', '"']
  27 ERROR_MSGS = ['Critical nodes are not available for deployment',
  28               'offline. Remove them from environment and try again.',
  29               'Task[move_to_bootstrap/',
  30               'Failed tasks: Task[connectivity-checker/']
  31
  32
  33 class DeployNotStart(Exception):
  34     """Unable to start deployment"""
  35
  36
  37 class NodesGoOffline(Exception):
  38     """Nodes goes offline during deployment"""
  39
  40
  41 class Deployment(object):
  42
  43     def __init__(self, dea, yaml_config_dir, env_id, node_id_roles_dict,
  44                  no_health_check, deploy_timeout):
  45         self.dea = dea
  46         self.yaml_config_dir = yaml_config_dir
  47         self.env_id = env_id
  48         self.node_id_roles_dict = node_id_roles_dict
  49         self.no_health_check = no_health_check
  50         self.deploy_timeout = deploy_timeout
  51         self.pattern = re.compile(
  52             '\d\d\d\d-\d\d-\d\d\s\d\d:\d\d:\d\d')
  53
  54     def collect_error_logs(self):
  55         for node_id, roles_blade in self.node_id_roles_dict.iteritems():
  56             log_list = []
  57             cmd = ('ssh -q node-%s grep \'"%s"\' %s'
  58                    % (node_id, SEARCH_TEXT, LOG_FILE))
  59             results, _ = exec_cmd(cmd, False)
  60             for result in results.splitlines():
  61                 log_msg = ''
  62
  63                 sub_cmd = '"%s" %s' % (result, LOG_FILE)
  64                 for c in LIST_OF_CHAR_TO_BE_ESCAPED:
  65                     sub_cmd = sub_cmd.replace(c, '\%s' % c)
  66                 grep_cmd = ('grep -B%s %s'
  67                             % (GREP_LINES_OF_LEADING_CONTEXT, sub_cmd))
  68                 cmd = ('ssh -q node-%s "%s"' % (node_id, grep_cmd))
  69
  70                 details, _ = exec_cmd(cmd, False)
  71                 details_list = details.splitlines()
  72
  73                 found_prev_log = False
  74                 for i in range(len(details_list) - 2, -1, -1):
  75                     if self.pattern.match(details_list[i]):
  76                         found_prev_log = True
  77                         break
  78                 if found_prev_log:
  79                     log_msg += '\n'.join(details_list[i:-1]) + '\n'
  80
  81                 grep_cmd = ('grep -A%s %s'
  82                             % (GREP_LINES_OF_TRAILING_CONTEXT, sub_cmd))
  83                 cmd = ('ssh -q node-%s "%s"' % (node_id, grep_cmd))
  84
  85                 details, _ = exec_cmd(cmd, False)
  86                 details_list = details.splitlines()
  87
  88                 found_next_log = False
  89                 for i in range(1, len(details_list)):
  90                     if self.pattern.match(details_list[i]):
  91                         found_next_log = True
  92                         break
  93                 if found_next_log:
  94                     log_msg += '\n'.join(details_list[:i])
  95                 else:
  96                     log_msg += details
  97
  98                 if log_msg:
  99                     log_list.append(log_msg)
 100
 101             if log_list:
 102                 role = ('controller' if 'controller' in roles_blade[0]
 103                         else 'compute host')
 104                 log('_' * 40 + 'Errors in node-%s %s' % (node_id, role)
 105                     + '_' * 40)
 106                 for log_msg in log_list:
 107                     print(log_msg + '\n')
 108
 109     def run_deploy(self):
 110         SLEEP_TIME = 60
 111         abort_after = 60 * int(self.deploy_timeout)
 112         start = time.time()
 113
 114         log('Starting deployment of environment %s' % self.env_id)
 115         deploy_id = None
 116         ready = False
 117         timeout = False
 118
 119         attempts = 5
 120         while attempts > 0:
 121             try:
 122                 if time.time() > start + abort_after:
 123                     timeout = True
 124                     break
 125                 if not deploy_id:
 126                     deploy_id = self._start_deploy_task()
 127                 sts, prg, msg = self._deployment_status(deploy_id)
 128                 if sts == 'error':
 129                     log('Error during deployment: {}'.format(msg))
 130                     break
 131                 if sts == 'running':
 132                     log('Environment deployment progress: {}%'.format(prg))
 133                 elif sts == 'ready':
 134                     ready = True
 135                     break
 136                 time.sleep(SLEEP_TIME)
 137             except (DeployNotStart, NodesGoOffline) as e:
 138                 log(e)
 139                 attempts -= 1
 140                 deploy_id = None
 141                 time.sleep(SLEEP_TIME * attempts)
 142
 143         if timeout:
 144             err('Deployment timed out, environment %s is not operational, '
 145                 'snapshot will not be performed'
 146                 % self.env_id)
 147         if ready:
 148             log('Environment %s successfully deployed'
 149                 % self.env_id)
 150         else:
 151             self.collect_error_logs()
 152             err('Deployment failed, environment %s is not operational'
 153                 % self.env_id, self.collect_logs)
 154
 155     def _start_deploy_task(self):
 156         out, _ = exec_cmd('fuel2 env deploy {}'.format(self.env_id), False)
 157         id = self._deployment_task_id(out)
 158         return id
 159
 160     def _deployment_task_id(self, response):
 161         response = str(response)
 162         if response.startswith('Deployment task with id'):
 163             for s in response.split():
 164                 if s.isdigit():
 165                     return int(s)
 166         raise DeployNotStart('Unable to start deployment: {}'.format(response))
 167
 168     def _deployment_status(self, id):
 169         task = self._task_fields(id)
 170         if task['status'] == 'error':
 171             if any(msg in task['message'] for msg in ERROR_MSGS):
 172                 raise NodesGoOffline(task['message'])
 173         return task['status'], task['progress'], task['message']
 174
 175     def _task_fields(self, id):
 176         try:
 177             out, _ = exec_cmd('fuel2 task show {} -f json'.format(id), False)
 178             task_info = json.loads(out)
 179             properties = {}
 180             # for 9.0 this can be list of dicts or dict
 181             # see https://bugs.launchpad.net/fuel/+bug/1625518
 182             if isinstance(task_info, list):
 183                 for d in task_info:
 184                         properties.update({d['Field']: d['Value']})
 185             else:
 186                 return task_info
 187             return properties
 188         except ValueError as e:
 189             err('Unable to fetch task info: {}'.format(e))
 190
 191     def collect_logs(self):
 192         log('Cleaning out any previous deployment logs')
 193         exec_cmd('rm -f /var/log/remote/fuel-snapshot-*', False)
 194         exec_cmd('rm -f /root/deploy-*', False)
 195         log('Generating Fuel deploy snap-shot')
 196         if exec_cmd('fuel snapshot < /dev/null &> snapshot.log', False)[1] != 0:
 197             log('Could not create a Fuel snapshot')
 198         else:
 199             exec_cmd('mv /root/fuel-snapshot* /var/log/remote/', False)
 200
 201         log('Collecting all Fuel Snapshot & deploy log files')
 202         r, _ = exec_cmd('tar -czhf /root/deploy-%s.log.tar.gz /var/log/remote' % time.strftime("%Y%m%d-%H%M%S"), False)
 203         log(r)
 204
 205     def verify_node_status(self):
 206         node_list = parse(exec_cmd('fuel --env %s node' % self.env_id))
 207         failed_nodes = []
 208         for node in node_list:
 209             if node[N['status']] != 'ready':
 210                 failed_nodes.append((node[N['id']], node[N['status']]))
 211
 212         if failed_nodes:
 213             summary = ''
 214             for node, status in failed_nodes:
 215                 summary += '[node %s, status %s]\n' % (node, status)
 216             err('Deployment failed: %s' % summary, self.collect_logs)
 217
 218     def health_check(self):
 219         log('Now running sanity and smoke health checks')
 220         r = exec_cmd('fuel health --env %s --check sanity,smoke --force' % self.env_id)
 221         log(r)
 222         if 'failure' in r:
 223             err('Healthcheck failed!', self.collect_logs)
 224
 225     def deploy(self):
 226         self.run_deploy()
 227         self.verify_node_status()
 228         if not self.no_health_check:
 229             self.health_check()
 230         self.collect_logs()