Merge "Support different format of fuel task info"
[fuel.git] / deploy / cloud / deployment.py
1 ###############################################################################
2 # Copyright (c) 2015 Ericsson AB and others.
3 # szilard.cserey@ericsson.com
4 # All rights reserved. This program and the accompanying materials
5 # are made available under the terms of the Apache License, Version 2.0
6 # which accompanies this distribution, and is available at
7 # http://www.apache.org/licenses/LICENSE-2.0
8 ###############################################################################
9
10 import time
11 import re
12 import json
13
14 from common import (
15     N,
16     E,
17     exec_cmd,
18     run_proc,
19     run_proc_wait_terminated,
20     run_proc_kill,
21     parse,
22     err,
23     log,
24     delete,
25 )
26
# Text grepped for in LOG_FILE on every node to locate error entries.
SEARCH_TEXT = '(err)'
# Log file inspected on each node (over ssh) by collect_error_logs().
LOG_FILE = '/var/log/puppet.log'
# Number of lines requested before a matching line (grep -B).
GREP_LINES_OF_LEADING_CONTEXT = 100
# Number of lines requested after a matching line (grep -A).
GREP_LINES_OF_TRAILING_CONTEXT = 100
# Characters that get a backslash prepended before a log line is embedded
# in the double-quoted remote grep command.
LIST_OF_CHAR_TO_BE_ESCAPED = ['[', ']', '"']
32
33
class DeployNotStart(Exception):
    """Raised when the deployment task could not be started.

    Deployment._deployment_task_id() raises it when the deploy command
    output does not contain a task id; Deployment.run_deploy() catches it
    and retries.
    """
36
37
class NodesGoOffline(Exception):
    """Raised when nodes go offline during deployment.

    Deployment._deployment_status() raises it when a task is in 'error'
    state and its message reports offline nodes; Deployment.run_deploy()
    catches it and retries.
    """
40
41
class Deployment(object):
    """Start, monitor and verify the Fuel deployment of one environment.

    The class shells out to the ``fuel`` / ``fuel2`` command line clients
    through ``exec_cmd`` and reports progress through ``log`` / ``err``.
    """

    def __init__(self, dea, yaml_config_dir, env_id, node_id_roles_dict,
                 no_health_check, deploy_timeout):
        """Store the deployment settings.

        :param dea: stored on the instance; not used by this class itself
        :param yaml_config_dir: stored on the instance; not used by this
            class itself
        :param env_id: id of the Fuel environment to deploy
        :param node_id_roles_dict: maps a node id to a sequence whose first
            item contains the roles of that node
        :param no_health_check: when true, deploy() skips health_check()
        :param deploy_timeout: deployment timeout in minutes (anything
            accepted by int())
        """
        self.dea = dea
        self.yaml_config_dir = yaml_config_dir
        self.env_id = env_id
        self.node_id_roles_dict = node_id_roles_dict
        self.no_health_check = no_health_check
        self.deploy_timeout = deploy_timeout
        # Matches the "YYYY-MM-DD hh:mm:ss" timestamp that opens a log
        # entry; used to find entry boundaries around an error line.
        self.pattern = re.compile(
            r'\d\d\d\d-\d\d-\d\d\s\d\d:\d\d:\d\d')

    def collect_error_logs(self):
        """Print the error entries found in LOG_FILE on every node.

        For each line containing SEARCH_TEXT the surrounding lines are
        fetched with grep -B / -A and trimmed to the closest lines that
        start with a timestamp, so that multi-line entries are kept whole.
        """
        for node_id, roles_blade in self.node_id_roles_dict.items():
            log_list = []
            cmd = ('ssh -q node-%s grep \'"%s"\' %s'
                   % (node_id, SEARCH_TEXT, LOG_FILE))
            results, _ = exec_cmd(cmd, False)
            for result in results.splitlines():
                log_msg = ''

                # The matched line becomes the grep pattern of a second,
                # double-quoted remote command, hence the escaping (which
                # also covers the quotes wrapped around the line here).
                sub_cmd = '"%s" %s' % (result, LOG_FILE)
                for c in LIST_OF_CHAR_TO_BE_ESCAPED:
                    sub_cmd = sub_cmd.replace(c, '\\%s' % c)

                # Leading context: the last line is the match itself, so
                # search backwards from the line before it for the start
                # of the previous timestamped entry.
                grep_cmd = ('grep -B%s %s'
                            % (GREP_LINES_OF_LEADING_CONTEXT, sub_cmd))
                cmd = ('ssh -q node-%s "%s"' % (node_id, grep_cmd))
                details, _ = exec_cmd(cmd, False)
                details_list = details.splitlines()

                prev_start = None
                for i in range(len(details_list) - 2, -1, -1):
                    if self.pattern.match(details_list[i]):
                        prev_start = i
                        break
                if prev_start is not None:
                    log_msg += '\n'.join(details_list[prev_start:-1]) + '\n'

                # Trailing context: the first line is the match itself;
                # keep everything up to the next timestamped entry.
                grep_cmd = ('grep -A%s %s'
                            % (GREP_LINES_OF_TRAILING_CONTEXT, sub_cmd))
                cmd = ('ssh -q node-%s "%s"' % (node_id, grep_cmd))
                details, _ = exec_cmd(cmd, False)
                details_list = details.splitlines()

                next_start = None
                for i in range(1, len(details_list)):
                    if self.pattern.match(details_list[i]):
                        next_start = i
                        break
                if next_start is not None:
                    log_msg += '\n'.join(details_list[:next_start])
                else:
                    # No following entry in the window: keep all of it.
                    log_msg += details

                if log_msg:
                    log_list.append(log_msg)

            if log_list:
                role = ('controller' if 'controller' in roles_blade[0]
                        else 'compute host')
                log('_' * 40 + 'Errors in node-%s %s' % (node_id, role)
                    + '_' * 40)
                for log_msg in log_list:
                    print(log_msg + '\n')

    def run_deploy(self):
        """Start the deployment and poll it until it finishes.

        The task status is polled every SLEEP_TIME seconds. When the task
        cannot be started or nodes go offline, the deployment is started
        again, up to three times, with a growing pause between attempts.
        Polling stops on 'ready', on 'error' or when deploy_timeout
        minutes have elapsed. Anything but 'ready' is reported with err()
        after the node error logs have been printed.
        """
        SLEEP_TIME = 60
        abort_after = 60 * int(self.deploy_timeout)
        start = time.time()

        log('Starting deployment of environment %s' % self.env_id)
        deploy_id = None
        ready = False
        timeout = False

        attempts = 0
        while attempts < 3:
            try:
                if time.time() > start + abort_after:
                    timeout = True
                    break
                if not deploy_id:
                    deploy_id = self._start_deploy_task()
                sts, prg, msg = self._deployment_status(deploy_id)
                if sts == 'error':
                    log('Error during deployment: {}'.format(msg))
                    break
                if sts == 'running':
                    log('Environmnent deploymnet progress: {}%'.format(prg))
                elif sts == 'ready':
                    ready = True
                    break
                time.sleep(SLEEP_TIME)
            except (DeployNotStart, NodesGoOffline) as e:
                log(e)
                attempts += 1
                # Forget the failed task so the next pass starts a new one.
                deploy_id = None
                time.sleep(SLEEP_TIME * attempts)

        if timeout:
            err('Deployment timed out, environment %s is not operational, '
                'snapshot will not be performed'
                % self.env_id)
        if ready:
            log('Environment %s successfully deployed'
                % self.env_id)
        else:
            self.collect_error_logs()
            err('Deployment failed, environment %s is not operational'
                % self.env_id, self.collect_logs)

    def _start_deploy_task(self):
        """Launch the deployment and return the id of the created task.

        :raises DeployNotStart: when no task id is found in the output
        """
        out, _ = exec_cmd('fuel2 env deploy {}'.format(self.env_id), False)
        return self._deployment_task_id(out)

    def _deployment_task_id(self, response):
        """Extract the task id from the output of the deploy command.

        :param response: output of 'fuel2 env deploy'
        :returns: the first all-digit word of the response, as an int
        :raises DeployNotStart: when the response does not start with
            'Deployment task with id' or contains no all-digit word
        """
        response = str(response)
        if response.startswith('Deployment task with id'):
            for s in response.split():
                if s.isdigit():
                    return int(s)
        raise DeployNotStart('Unable to start deployment: {}'.format(response))

    def _deployment_status(self, id):
        """Return (status, progress, message) of the given task.

        :param id: id of the deployment task
        :raises NodesGoOffline: when the task failed because nodes went
            offline
        """
        task = self._task_fields(id)
        message = task['message']
        # The message may be empty or None; only inspect real text.
        if (task['status'] == 'error' and message and message.endswith(
                'offline. Remove them from environment and try again.')):
            raise NodesGoOffline(message)
        return task['status'], task['progress'], message

    def _task_fields(self, id):
        """Fetch the fields of a task as a dict.

        :param id: id of the task to query with 'fuel2 task show'
        :returns: dict of task fields; err() is called when the output is
            not valid JSON
        """
        try:
            out, _ = exec_cmd('fuel2 task show {} -f json'.format(id), False)
            task_info = json.loads(out)
        except ValueError as e:
            err('Unable to fetch task info: {}'.format(e))
            # NOTE(review): err() presumably terminates the program; keep
            # the original fall-through result in case it returns.
            return None
        # for 9.0 this can be list of dicts or dict
        # see https://bugs.launchpad.net/fuel/+bug/1625518
        if isinstance(task_info, list):
            return {d['Field']: d['Value'] for d in task_info}
        return task_info

    def collect_logs(self):
        """Create a Fuel snapshot and archive the deployment logs.

        Previous snapshots and archives are removed first. The archive is
        written to /root/deploy-<timestamp>.log.tar.gz.
        """
        log('Cleaning out any previous deployment logs')
        exec_cmd('rm -f /var/log/remote/fuel-snapshot-*', False)
        exec_cmd('rm -f /root/deploy-*', False)
        log('Generating Fuel deploy snap-shot')
        # With its second argument False, exec_cmd() is used here as
        # returning an (output, return code) pair.
        _, return_code = exec_cmd(
            'fuel snapshot < /dev/null &> snapshot.log', False)
        if return_code != 0:
            log('Could not create a Fuel snapshot')
        else:
            exec_cmd('mv /root/fuel-snapshot* /var/log/remote/', False)

        log('Collecting all Fuel Snapshot & deploy log files')
        r, _ = exec_cmd(
            'tar -czhf /root/deploy-%s.log.tar.gz /var/log/remote'
            % time.strftime("%Y%m%d-%H%M%S"), False)
        log(r)

    def verify_node_status(self):
        """Report an error when any node of the environment is not ready."""
        node_list = parse(exec_cmd('fuel --env %s node' % self.env_id))
        failed_nodes = []
        for node in node_list:
            if node[N['status']] != 'ready':
                failed_nodes.append((node[N['id']], node[N['status']]))

        if failed_nodes:
            summary = ''.join('[node %s, status %s]\n' % (node, status)
                              for node, status in failed_nodes)
            err('Deployment failed: %s' % summary, self.collect_logs)

    def health_check(self):
        """Run the sanity and smoke health checks; report any failure."""
        log('Now running sanity and smoke health checks')
        r = exec_cmd('fuel health --env %s --check sanity,smoke --force'
                     % self.env_id)
        log(r)
        if 'failure' in r:
            err('Healthcheck failed!', self.collect_logs)

    def deploy(self):
        """Deploy the environment, verify it and collect the logs."""
        self.run_deploy()
        self.verify_node_status()
        if not self.no_health_check:
            self.health_check()
        self.collect_logs()
231             self.health_check()
232         self.collect_logs()