maas: Avoid race condition in node fixups
[fuel.git] / mcp / patches / salt-formula-maas / 0003-Extend-wait_for-maas.py-wait_for_-attempts-arg.patch
1 ::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
2 : Copyright (c) 2019 Mirantis Inc., Enea AB and others.
3 :
4 : All rights reserved. This program and the accompanying materials
5 : are made available under the terms of the Apache License, Version 2.0
6 : which accompanies this distribution, and is available at
7 : http://www.apache.org/licenses/LICENSE-2.0
8 ::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
9 From: Alexandru Avadanii <Alexandru.Avadanii@enea.com>
10 Date: Sun, 23 Sep 2018 03:57:27 +0200
11 Subject: [PATCH] Extend wait_for maas.py, wait_for_* attempts arg
12
13 1. maas.py: Extend wait_for states with timeout param
14
15 Extend the wait_for states with a timeout parameter.
16 The timeout value is taken from reclass pillar data if
17 defined. Otherwise, the states use the default value.
18 Based on Ting's PR [1], slightly refactored.
19
20 2. maas.py: Extend `req_status` support to multiple values
21
22 Previously, req_status could be one of the MaaS status strings, e.g.
23 'Ready'. Extend matching to '|'-separated statuses (e.g.
24 'Ready|Deployed') to allow idempotency in MaaS machine commissioning
25 and deployment cycles.
26
27 Also provide a `maas.machines.wait_for_ready_or_deployed` sls.
28
29 3. maas.py: wait_for_*: Add attempts arg
30
31 Introduce a new parameter that allows a maximum number of automatic
32 recovery attempts for the common failures w/ machine operations.
33 If not present in pillar data, it defaults to 0 (OFF).
34
35 Common error states, possible cause and automatic recovery pattern:
36 * New
37   - usually indicates issues with BMC connectivity (no network route),
38     but on rare occasions it happens due to MaaS API being flaky;
39   - fix: delete the machine, (re)process machine definitions;
40 * Failed commissioning
41   - various causes, usually a simple retry works;
42   - fix: delete the machine, (re)process machine definitions;
43 * Failed testing
44   - incompatible hardware, missing drivers etc.
45   - usually consistent and board-specific;
46   - fix: override failed testing
47 * Allocated
48   - on rare occasions nodes get stuck in this state instead of 'Deploy';
49   - fix: mark-broken, mark-fixed; if it failed at least once before,
50     perform a fio test (fixes another unrelated spurious issue with
51     encrypted disks from previous deployments); then (re)deploy machines;
52 * Failed deployment
53   - various causes, usually a simple retry works;
54   - fix: same as for nodes stuck in 'Allocated';
55
56 [1] https://github.com/salt-formulas/salt-formula-maas/pull/34
57
58 Change-Id: Ifb7dd9f8fcfbbed557e47d8fdffb1f963604fb15
59 Signed-off-by: ting wu <ting.wu@enea.com>
60 Signed-off-by: Alexandru Avadanii <Alexandru.Avadanii@enea.com>
61 ---
62  README.rst                                   |  9 +++-
63  _modules/maas.py                             | 52 +++++++++++++++++---
64  maas/machines/wait_for_deployed.sls          |  2 +
65  maas/machines/wait_for_ready.sls             |  3 ++
66  maas/machines/wait_for_ready_or_deployed.sls | 15 ++++++
67  maas/map.jinja                               |  4 ++
68  tests/pillar/maas_region.sls                 |  4 ++
69  7 files changed, 81 insertions(+), 8 deletions(-)
70  create mode 100644 maas/machines/wait_for_ready_or_deployed.sls
71
72 diff --git a/README.rst b/README.rst
73 index 20da43e..78d8aef 100644
74 --- a/README.rst
75 +++ b/README.rst
76 @@ -622,12 +622,16 @@ Wait for status of selected machine's:
77              machines:
78                - kvm01
79                - kvm02
80 -            timeout: 1200 # in seconds
81 +            timeout: {{ region.timeout.ready }}
82 +            attempts: {{ region.timeout.attempts }}
83              req_status: "Ready"
84        - require:
85          - cmd: maas_login_admin
86        ...
87
88 +The timeout setting is taken from the reclass pillar data.
89 +If the pillar data is not defined, it will use the default value.
90 +
91  If module run w/\o any extra paremeters,
92  ``wait_for_machines_ready`` will wait for defined in salt
93  machines. In this case, it is usefull to skip some machines:
94 @@ -642,7 +646,8 @@ machines. In this case, it is usefull to skip some machines:
95        module.run:
96        - name: maas.wait_for_machine_status
97        - kwargs:
98 -            timeout: 1200 # in seconds
99 +            timeout: {{ region.timeout.deployed }}
100 +            attempts: {{ region.timeout.attempts }}
101              req_status: "Deployed"
102              ignore_machines:
103                 - kvm01 # in case it's broken or whatever
104 diff --git a/_modules/maas.py b/_modules/maas.py
105 index c02f104..bb70576 100644
106 --- a/_modules/maas.py
107 +++ b/_modules/maas.py
108 @@ -670,7 +670,7 @@ class DeployMachines(MaasObject):
109          if machine['status'] == self.DEPLOYED:
110              return
111          if machine['status'] != self.READY:
112 -            raise Exception('Not in ready state')
113 +            return
114          data = {
115              'system_id': machine['system_id'],
116          }
117 @@ -921,6 +921,7 @@ class MachinesStatus(MaasObject):
118              req_status: string; Polling status
119              machines:   list; machine names
120              ignore_machines: list; machine names
121 +            attempts:   max number of automatic hard retries
122          :ret: True
123                   Exception - if something fail/timeout reached
124          """
125 @@ -929,6 +930,8 @@ class MachinesStatus(MaasObject):
126          req_status = kwargs.get("req_status", "Ready")
127          to_discover = kwargs.get("machines", None)
128          ignore_machines = kwargs.get("ignore_machines", None)
129 +        attempts = kwargs.get("attempts", 0)
130 +        failed_attempts = {}
131          if not to_discover:
132              try:
133                  to_discover = __salt__['config.get']('maas')['region'][
134 @@ -943,11 +946,46 @@ class MachinesStatus(MaasObject):
135          while len(total) <= len(to_discover):
136              for m in to_discover:
137                  for discovered in MachinesStatus.execute()['machines']:
138 -                    if m == discovered['hostname'] and \
139 -                            discovered['status'].lower() == req_status.lower():
140 -                        if m in total:
141 +                    if m == discovered['hostname'] and m in total:
142 +                        req_status_list = req_status.lower().split('|')
143 +                        if discovered['status'].lower() in req_status_list:
144                              total.remove(m)
145 -
146 +                        elif attempts > 0 and (m not in failed_attempts or
147 +                                               failed_attempts[m] < attempts):
148 +                            status = discovered['status']
149 +                            sid = discovered['system_id']
150 +                            cls._maas = _create_maas_client()
151 +                            if status in ['Failed commissioning', 'New']:
152 +                                LOG.info('Machine {0} deleted'.format(sid))
153 +                                cls._maas.delete(u'api/2.0/machines/{0}/'
154 +                                    .format(sid))
155 +                                Machine().process()
156 +                            elif status in ['Failed testing']:
157 +                                data = {}
158 +                                LOG.info('Machine {0} overriden'.format(sid))
159 +                                action = 'override_failed_testing'
160 +                                cls._maas.post(u'api/2.0/machines/{0}/'
161 +                                    .format(sid), action, **data)
162 +                            elif status in ['Failed deployment', 'Allocated']:
163 +                                data = {}
164 +                                LOG.info('Machine {0} mark broken'.format(sid))
165 +                                cls._maas.post(u'api/2.0/machines/{0}/'
166 +                                    .format(sid), 'mark_broken', **data)
167 +                                time.sleep(poll_time)
168 +                                LOG.info('Machine {0} mark fixed'.format(sid))
169 +                                cls._maas.post(u'api/2.0/machines/{0}/'
170 +                                    .format(sid), 'mark_fixed', **data)
171 +                                if m in failed_attempts and failed_attempts[m]:
172 +                                    LOG.info('Machine {0} fio test'.format(sid))
173 +                                    data['testing_scripts'] = 'fio'
174 +                                    cls._maas.post(u'api/2.0/machines/{0}/'
175 +                                        .format(sid), 'commission', **data)
176 +                                DeployMachines().process()
177 +                            else:
178 +                                continue
179 +                            if m not in failed_attempts:
180 +                                failed_attempts[m] = 0
181 +                            failed_attempts[m] = failed_attempts[m] + 1
182              if len(total) <= 0:
183                  LOG.debug(
184                      "Machines:{} are:{}".format(to_discover, req_status))
185 @@ -959,7 +997,9 @@ class MachinesStatus(MaasObject):
186                  "Waiting status:{} "
187                  "for machines:{}"
188                  "\nsleep for:{}s "
189 -                "Timeout:{}s".format(req_status, total, poll_time, timeout))
190 +                "Timeout:{}s ({}s left)"
191 +                .format(req_status, total, poll_time, timeout,
192 +                    timeout - (time.time() - started_at)))
193              time.sleep(poll_time)
194
195
196 diff --git a/maas/machines/wait_for_deployed.sls b/maas/machines/wait_for_deployed.sls
197 index ebeedac..a646fdb 100644
198 --- a/maas/machines/wait_for_deployed.sls
199 +++ b/maas/machines/wait_for_deployed.sls
200 @@ -9,5 +9,7 @@ wait_for_machines_deployed:
201    - name: maas.wait_for_machine_status
202    - kwargs:
203          req_status: "Deployed"
204 +        timeout: {{ region.timeout.deployed }}
205 +        attempts: {{ region.timeout.attempts }}
206    - require:
207      - cmd: maas_login_admin
208 diff --git a/maas/machines/wait_for_ready.sls b/maas/machines/wait_for_ready.sls
209 index c5d3c28..d8a2963 100644
210 --- a/maas/machines/wait_for_ready.sls
211 +++ b/maas/machines/wait_for_ready.sls
212 @@ -7,5 +7,8 @@ maas_login_admin:
213  wait_for_machines_ready:
214    module.run:
215    - name: maas.wait_for_machine_status
216 +  - kwargs:
217 +        timeout: {{ region.timeout.ready }}
218 +        attempts: {{ region.timeout.attempts }}
219    - require:
220      - cmd: maas_login_admin
221 diff --git a/maas/machines/wait_for_ready_or_deployed.sls b/maas/machines/wait_for_ready_or_deployed.sls
222 new file mode 100644
223 index 0000000..db3dcc4
224 --- /dev/null
225 +++ b/maas/machines/wait_for_ready_or_deployed.sls
226 @@ -0,0 +1,15 @@
227 +{%- from "maas/map.jinja" import region with context %}
228 +
229 +maas_login_admin:
230 +  cmd.run:
231 +  - name: "maas-region apikey --username {{ region.admin.username }} > /var/lib/maas/.maas_credentials"
232 +
233 +wait_for_machines_ready_or_deployed:
234 +  module.run:
235 +  - name: maas.wait_for_machine_status
236 +  - kwargs:
237 +        req_status: "Ready|Deployed"
238 +        timeout: {{ region.timeout.ready }}
239 +        attempts: {{ region.timeout.attempts }}
240 +  - require:
241 +    - cmd: maas_login_admin
242 diff --git a/maas/map.jinja b/maas/map.jinja
243 index 0671435..1e6ac07 100644
244 --- a/maas/map.jinja
245 +++ b/maas/map.jinja
246 @@ -22,6 +22,10 @@ Debian:
247    bind:
248      host: 0.0.0.0
249      port: 80
250 +  timeout:
251 +    ready: 1200
252 +    deployed: 7200
253 +    attempts: 0
254  {%- endload %}
255
256  {%- set region = salt['grains.filter_by'](region_defaults, merge=salt['pillar.get']('maas:region', {})) %}
257 diff --git a/tests/pillar/maas_region.sls b/tests/pillar/maas_region.sls
258 index d3325eb..d710216 100644
259 --- a/tests/pillar/maas_region.sls
260 +++ b/tests/pillar/maas_region.sls
261 @@ -34,3 +34,7 @@ maas:
262        password: password
263        username: maas
264      salt_master_ip: 127.0.0.1
265 +    timeout:
266 +      deployed: 900
267 +      ready: 900
268 +      attempts: 2