1 ::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
2 : Copyright (c) 2018 Mirantis Inc., Enea AB and others.
4 : All rights reserved. This program and the accompanying materials
5 : are made available under the terms of the Apache License, Version 2.0
6 : which accompanies this distribution, and is available at
7 : http://www.apache.org/licenses/LICENSE-2.0
8 ::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
9 From: Alexandru Avadanii <Alexandru.Avadanii@enea.com>
10 Date: Sun, 23 Sep 2018 03:57:27 +0200
11 Subject: [PATCH] Extend wait_for maas.py, wait_for_* attempts arg
13 1. maas.py: Extend wait_for states with timeout param
15 Extend the wait_for states with a timeout parameter.
16 The timeout value is taken from reclass pillar data if
17 defined. Otherwise, the states use the default value.
18 Based on Ting's PR [1], slightly refactored.
20 2. maas.py: Extend `req_status` support to multiple values
22 Previously, req_status could be one of the MaaS status strings, e.g.
23 'Ready'. Extend matching to '|'-separated statuses (e.g.
24 'Ready|Deployed') to allow idempotency in MaaS machine commissioning
25 and deployment cycles.
27 Also provide a `maas.machines.wait_for_ready_or_deployed` sls.
29 3. maas.py: wait_for_*: Add attempts arg
31 Introduce a new parameter that allows a maximum number of automatic
32 recovery attempts for the common failures w/ machine operations.
33 If not present in pillar data, it defaults to 0 (OFF).
35 Common error states, possible cause and automatic recovery pattern:
37 - usually indicates issues with BMC connectivity (no network route,
38 but on rare occasions it happens due to MaaS API being flaky);
39 - fix: delete the machine, (re)process machine definitions;
40 * Failed commissioning
41 - various causes, usually a simple retry works;
42 - fix: delete the machine, (re)process machine definitions;
44 - incompatible hardware, missing drivers etc.
45 - usually consistent and board-specific;
46 - fix: override failed testing
48 - on rare occasions nodes get stuck in this state instead of 'Deploy';
49 - fix: mark-broken, mark-fixed, if it failed at least once before
50 perform a fio test (fixes another unrelated spurious issue with
51 encrypted disks from previous deployments), (re)deploy machines;
53 - various causes, usually a simple retry works;
54 - fix: same as for nodes stuck in 'Allocated';
56 [1] https://github.com/salt-formulas/salt-formula-maas/pull/34
58 Change-Id: Ifb7dd9f8fcfbbed557e47d8fdffb1f963604fb15
59 Signed-off-by: ting wu <ting.wu@enea.com>
60 Signed-off-by: Alexandru Avadanii <Alexandru.Avadanii@enea.com>
63 _modules/maas.py | 49 ++++++++++++++++++--
64 maas/machines/wait_for_deployed.sls | 2 +
65 maas/machines/wait_for_ready.sls | 3 ++
66 maas/machines/wait_for_ready_or_deployed.sls | 15 ++++++
68 tests/pillar/maas_region.sls | 4 ++
69 7 files changed, 79 insertions(+), 7 deletions(-)
70 create mode 100644 maas/machines/wait_for_ready_or_deployed.sls
72 diff --git a/README.rst b/README.rst
73 index 20da43e..78d8aef 100644
76 @@ -622,12 +622,16 @@ Wait for status of selected machine's:
80 - timeout: 1200 # in seconds
81 + timeout: {{ region.timeout.ready }}
82 + attempts: {{ region.timeout.attempts }}
85 - cmd: maas_login_admin
88 +The timeout setting is taken from the reclass pillar data.
89 +If the pillar data is not defined, it will use the default value.
91 If module run w/\o any extra paremeters,
92 ``wait_for_machines_ready`` will wait for defined in salt
93 machines. In this case, it is usefull to skip some machines:
94 @@ -642,7 +646,8 @@ machines. In this case, it is usefull to skip some machines:
96 - name: maas.wait_for_machine_status
98 - timeout: 1200 # in seconds
99 + timeout: {{ region.timeout.deployed }}
100 + attempts: {{ region.timeout.attempts }}
101 req_status: "Deployed"
103 - kvm01 # in case it's broken or whatever
104 diff --git a/_modules/maas.py b/_modules/maas.py
105 index c02f104..28e46c5 100644
106 --- a/_modules/maas.py
107 +++ b/_modules/maas.py
108 @@ -921,6 +921,7 @@ class MachinesStatus(MaasObject):
109 req_status: string; Polling status
110 machines: list; machine names
111 ignore_machines: list; machine names
112 + attempts: max number of automatic hard retries
114 Exception - if something fail/timeout reached
116 @@ -929,6 +930,8 @@ class MachinesStatus(MaasObject):
117 req_status = kwargs.get("req_status", "Ready")
118 to_discover = kwargs.get("machines", None)
119 ignore_machines = kwargs.get("ignore_machines", None)
120 + attempts = kwargs.get("attempts", 0)
121 + failed_attempts = {}
124 to_discover = __salt__['config.get']('maas')['region'][
125 @@ -943,11 +946,45 @@ class MachinesStatus(MaasObject):
126 while len(total) <= len(to_discover):
127 for m in to_discover:
128 for discovered in MachinesStatus.execute()['machines']:
129 - if m == discovered['hostname'] and \
130 - discovered['status'].lower() == req_status.lower():
132 + if m == discovered['hostname'] and m in total:
133 + req_status_list = req_status.lower().split('|')
134 + if discovered['status'].lower() in req_status_list:
137 + elif attempts > 0 and (m not in failed_attempts or
138 + failed_attempts[m] < attempts):
139 + status = discovered['status']
140 + sid = discovered['system_id']
141 + cls._maas = _create_maas_client()
142 + if status in ['Failed commissioning', 'New']:
143 + LOG.info('Machine {0} deleted'.format(sid))
144 + cls._maas.delete(u'api/2.0/machines/{0}/'
146 + Machine().process()
147 + elif status in ['Failed testing']:
149 + LOG.info('Machine {0} overriden'.format(sid))
150 + action = 'override_failed_testing'
151 + cls._maas.post(u'api/2.0/machines/{0}/'
152 + .format(sid), action, **data)
153 + elif status in ['Failed deployment', 'Allocated']:
155 + LOG.info('Machine {0} mark broken'.format(sid))
156 + cls._maas.post(u'api/2.0/machines/{0}/'
157 + .format(sid), 'mark_broken', **data)
158 + LOG.info('Machine {0} mark fixed'.format(sid))
159 + cls._maas.post(u'api/2.0/machines/{0}/'
160 + .format(sid), 'mark_fixed', **data)
161 + if m in failed_attempts and failed_attempts[m]:
162 + LOG.info('Machine {0} fio test'.format(sid))
163 + data['testing_scripts'] = 'fio'
164 + cls._maas.post(u'api/2.0/machines/{0}/'
165 + .format(sid), 'commission', **data)
166 + DeployMachines().process()
169 + if m not in failed_attempts:
170 + failed_attempts[m] = 0
171 + failed_attempts[m] = failed_attempts[m] + 1
174 "Machines:{} are:{}".format(to_discover, req_status))
175 @@ -959,7 +996,9 @@ class MachinesStatus(MaasObject):
179 - "Timeout:{}s".format(req_status, total, poll_time, timeout))
180 + "Timeout:{}s ({}s left)"
181 + .format(req_status, total, poll_time, timeout,
182 + timeout - (time.time() - started_at)))
183 time.sleep(poll_time)
186 diff --git a/maas/machines/wait_for_deployed.sls b/maas/machines/wait_for_deployed.sls
187 index ebeedac..a646fdb 100644
188 --- a/maas/machines/wait_for_deployed.sls
189 +++ b/maas/machines/wait_for_deployed.sls
190 @@ -9,5 +9,7 @@ wait_for_machines_deployed:
191 - name: maas.wait_for_machine_status
193 req_status: "Deployed"
194 + timeout: {{ region.timeout.deployed }}
195 + attempts: {{ region.timeout.attempts }}
197 - cmd: maas_login_admin
198 diff --git a/maas/machines/wait_for_ready.sls b/maas/machines/wait_for_ready.sls
199 index c5d3c28..d8a2963 100644
200 --- a/maas/machines/wait_for_ready.sls
201 +++ b/maas/machines/wait_for_ready.sls
202 @@ -7,5 +7,8 @@ maas_login_admin:
203 wait_for_machines_ready:
205 - name: maas.wait_for_machine_status
207 + timeout: {{ region.timeout.ready }}
208 + attempts: {{ region.timeout.attempts }}
210 - cmd: maas_login_admin
211 diff --git a/maas/machines/wait_for_ready_or_deployed.sls b/maas/machines/wait_for_ready_or_deployed.sls
213 index 0000000..db3dcc4
215 +++ b/maas/machines/wait_for_ready_or_deployed.sls
217 +{%- from "maas/map.jinja" import region with context %}
221 + - name: "maas-region apikey --username {{ region.admin.username }} > /var/lib/maas/.maas_credentials"
223 +wait_for_machines_ready_or_deployed:
225 + - name: maas.wait_for_machine_status
227 + req_status: "Ready|Deployed"
228 + timeout: {{ region.timeout.ready }}
229 + attempts: {{ region.timeout.attempts }}
231 + - cmd: maas_login_admin
232 diff --git a/maas/map.jinja b/maas/map.jinja
233 index 0671435..1e6ac07 100644
236 @@ -22,6 +22,10 @@ Debian:
246 {%- set region = salt['grains.filter_by'](region_defaults, merge=salt['pillar.get']('maas:region', {})) %}
247 diff --git a/tests/pillar/maas_region.sls b/tests/pillar/maas_region.sls
248 index d3325eb..d710216 100644
249 --- a/tests/pillar/maas_region.sls
250 +++ b/tests/pillar/maas_region.sls
251 @@ -34,3 +34,7 @@ maas:
254 salt_master_ip: 127.0.0.1