#!/bin/bash -e
-# shellcheck disable=SC1090,SC2155
+# shellcheck disable=SC1090
##############################################################################
-# Copyright (c) 2017 Mirantis Inc., Enea AB and others.
+# Copyright (c) 2018 Mirantis Inc., Enea AB and others.
# All rights reserved. This program and the accompanying materials
# are made available under the terms of the Apache License, Version 2.0
# which accompanies this distribution, and is available at
bm_nodes=$(salt --out yaml 'mas01*' pillar.get maas:region:machines | \
awk '/^\s+\w+[[:digit:]]+:$/ {gsub(/:$/, "*"); printf "%s ", $1}')
-# Wait for MaaS commissioning/deploy to finish, retry on failure
-function maas_fixup() {
- local statuscmd="salt 'mas01*' --out yaml state.apply maas.machines.status"
- local ncount=$(salt --out yaml 'mas01*' pillar.get maas:region:machines | \
- grep -cE '^\s{2}\w+:$')
-
- # wait_for has 10sec timeout * 96 = 16 min > 15min for Failed state
- wait_for 96 "${statuscmd} | tee /dev/stderr | " \
- "grep -Eq '((Deployed|Ready): ${ncount}|status: (Failed|Allocated))'"
- local statusout=$(eval "${statuscmd}")
-
- local fcnodes=$(echo "${statusout}" | \
- grep -Pzo 'status: Failed commissioning\n\s+system_id: \K.+\n')
- local ftnodes=$(echo "${statusout}" | \
- grep -Pzo 'status: Failed testing\n\s+system_id: \K.+\n')
- for node_system_id in ${fcnodes}; do
- salt -C 'mas01*' state.apply maas.machines.delete \
- pillar="{'system_id': '${node_system_id}'}"
- sleep 10
- done
- for node_system_id in ${ftnodes}; do
- salt -C 'mas01*' state.apply maas.machines.override_failed_testing \
- pillar="{'system_id': '${node_system_id}'}"
- sleep 10
- done
- if [ -n "${fcnodes}" ] || [ -n "${ftnodes}" ]; then
- salt -C 'mas01*' state.apply maas.machines
- return 1
- fi
-
- local fdnodes=$(echo "${statusout}" | \
- grep -Pzo 'status: (Failed deployment|Allocated)\n\s+system_id: \K.+\n')
- local rnodes=$(echo "${statusout}" | \
- grep -Pzo 'status: Ready\n\s+system_id: \K.+\n')
- for node_system_id in ${fdnodes}; do
- salt -C 'mas01*' state.apply maas.machines.mark_broken_fixed \
- pillar="{'system_id': '${node_system_id}'}"
- sleep 10
- done
- if [ -n "${fdnodes}" ] || [ -n "${rnodes}" ]; then
- salt -C 'mas01*' state.apply maas.machines.storage
- salt -C 'mas01*' state.apply maas.machines.deploy
- return 1
- fi
-
- return 0
-}
-
# Optionally destroy MaaS machines from a previous run
if [ "${ERASE_ENV}" -gt 1 ]; then
- set +e; dnodes=$(salt 'mas01*' --out yaml state.apply maas.machines.status | \
- grep -Pzo '\s+system_id: \K.+\n'); set -e
cleanup_uefi
- for node_system_id in ${dnodes}; do
- salt -C 'mas01*' state.apply maas.machines.delete \
- pillar="{'system_id': '${node_system_id}'}"
- sleep 10
+ for node_hostname in ${bm_nodes//\*/}; do
+ salt -C 'mas01*' maasng.delete_machine "${node_hostname}" || true
done
fi
# MaaS rack/region controller, node commissioning
-wait_for 10.0 "salt -C 'mas01*' state.apply linux,salt,openssh,ntp"
-salt -C 'mas01*' state.apply maas.pxe_nat
+wait_for 10.0 "salt -C 'mas01*' state.apply linux,salt,openssh,ntp,iptables"
salt -C 'mas01*' state.apply maas.cluster
wait_for 10 "salt -C 'mas01*' state.apply maas.region"
xargs --no-run-if-empty -I{} salt-key -yd {}
# MaaS node deployment
-wait_for 10 maas_fixup
-
-salt -C 'mas01*' pillar.item\
- maas:region:admin:username \
- maas:region:admin:password
+if [ -n "${bm_nodes}" ]; then
+ notify "[NOTE] MaaS operations might take a long time, please be patient" 2
+ salt -C 'mas01*' state.apply maas.machines.wait_for_ready_or_deployed
+ salt -C 'mas01*' state.apply maas.machines.storage
+ salt -C 'mas01*' state.apply maas.machines.deploy
+ salt -C 'mas01*' state.apply maas.machines.wait_for_deployed
+fi
# Check all baremetal nodes are available
wait_for 10.0 "(for n in ${bm_nodes}; do salt \${n} test.ping 2>/dev/null || exit; done)"
+++ /dev/null
-From: Alexandru Avadanii <Alexandru.Avadanii@enea.com>
-Date: Sun, 19 Aug 2018 05:38:27 +0200
-Subject: [PATCH] Add machines.delete & co, pxe_nat sls
-
-Signed-off-by: Alexandru Avadanii <Alexandru.Avadanii@enea.com>
----
- maas/machines/delete.sls | 20 ++++++++++++
- maas/machines/mark_broken_fixed.sls | 20 ++++++++++++
- maas/machines/override_failed_testing.sls | 20 ++++++++++++
- maas/pxe_nat.sls | 37 +++++++++++++++++++++++
- 4 files changed, 97 insertions(+)
- create mode 100644 maas/machines/delete.sls
- create mode 100644 maas/machines/mark_broken_fixed.sls
- create mode 100644 maas/machines/override_failed_testing.sls
- create mode 100644 maas/pxe_nat.sls
-
-diff --git a/maas/machines/delete.sls b/maas/machines/delete.sls
-new file mode 100644
-index 0000000..2903f92
---- /dev/null
-+++ b/maas/machines/delete.sls
-@@ -0,0 +1,20 @@
-+##############################################################################
-+# Copyright (c) 2017 Mirantis Inc., Enea AB and others.
-+# All rights reserved. This program and the accompanying materials
-+# are made available under the terms of the Apache License, Version 2.0
-+# which accompanies this distribution, and is available at
-+# http://www.apache.org/licenses/LICENSE-2.0
-+##############################################################################
-+{%- from "maas/map.jinja" import region with context %}
-+
-+maas_login_admin:
-+ cmd.run:
-+ - name: "maas-region apikey --username {{ region.admin.username }} > /var/lib/maas/.maas_credentials"
-+ - unless: 'test -e /var/lib/maas/.maas_credentials'
-+
-+# TODO: implement delete_machine via _modules/maas.py
-+delete_machine:
-+ cmd.run:
-+ - name: "maas login {{ region.admin.username }} http://{{ region.bind.host }}:5240/MAAS/api/2.0 - < /var/lib/maas/.maas_credentials && maas opnfv machine delete {{ pillar['system_id'] }}"
-+ - require:
-+ - cmd: maas_login_admin
-diff --git a/maas/machines/mark_broken_fixed.sls b/maas/machines/mark_broken_fixed.sls
-new file mode 100644
-index 0000000..46691bb
---- /dev/null
-+++ b/maas/machines/mark_broken_fixed.sls
-@@ -0,0 +1,20 @@
-+##############################################################################
-+# Copyright (c) 2017 Mirantis Inc., Enea AB and others.
-+# All rights reserved. This program and the accompanying materials
-+# are made available under the terms of the Apache License, Version 2.0
-+# which accompanies this distribution, and is available at
-+# http://www.apache.org/licenses/LICENSE-2.0
-+##############################################################################
-+{%- from "maas/map.jinja" import region with context %}
-+
-+maas_login_admin:
-+ cmd.run:
-+ - name: "maas-region apikey --username {{ region.admin.username }} > /var/lib/maas/.maas_credentials"
-+ - unless: 'test -e /var/lib/maas/.maas_credentials'
-+
-+# TODO: implement mark_broken_fixed_machine via _modules/maas.py
-+mark_broken_fixed_machine:
-+ cmd.run:
-+ - name: "maas login {{ region.admin.username }} http://{{ region.bind.host }}:5240/MAAS/api/2.0 - < /var/lib/maas/.maas_credentials && maas opnfv machine mark-broken {{ pillar['system_id'] }} && sleep 10 && maas opnfv machine mark-fixed {{ pillar['system_id'] }} && maas opnfv machine test {{ pillar['system_id'] }} testing_scripts=fio"
-+ - require:
-+ - cmd: maas_login_admin
-diff --git a/maas/machines/override_failed_testing.sls b/maas/machines/override_failed_testing.sls
-new file mode 100644
-index 0000000..e7fe1d2
---- /dev/null
-+++ b/maas/machines/override_failed_testing.sls
-@@ -0,0 +1,20 @@
-+##############################################################################
-+# Copyright (c) 2018 Mirantis Inc., Enea AB and others.
-+# All rights reserved. This program and the accompanying materials
-+# are made available under the terms of the Apache License, Version 2.0
-+# which accompanies this distribution, and is available at
-+# http://www.apache.org/licenses/LICENSE-2.0
-+##############################################################################
-+{%- from "maas/map.jinja" import region with context %}
-+
-+maas_login_admin:
-+ cmd.run:
-+ - name: "maas-region apikey --username {{ region.admin.username }} > /var/lib/maas/.maas_credentials"
-+ - unless: 'test -e /var/lib/maas/.maas_credentials'
-+
-+# TODO: implement override_failed_testing via _modules/maas.py
-+mark_broken_fixed_machine:
-+ cmd.run:
-+ - name: "maas login {{ region.admin.username }} http://{{ region.bind.host }}:5240/MAAS/api/2.0 - < /var/lib/maas/.maas_credentials && maas opnfv machine override-failed-testing {{ pillar['system_id'] }}"
-+ - require:
-+ - cmd: maas_login_admin
-diff --git a/maas/pxe_nat.sls b/maas/pxe_nat.sls
-new file mode 100644
-index 0000000..8a03c4f
---- /dev/null
-+++ b/maas/pxe_nat.sls
-@@ -0,0 +1,37 @@
-+##############################################################################
-+# Copyright (c) 2017 Mirantis Inc., Enea AB and others.
-+# All rights reserved. This program and the accompanying materials
-+# are made available under the terms of the Apache License, Version 2.0
-+# which accompanies this distribution, and is available at
-+# http://www.apache.org/licenses/LICENSE-2.0
-+##############################################################################
-+net.ipv4.ip_forward:
-+ sysctl.present:
-+ - value: 1
-+
-+iptables_pxe_nat:
-+ iptables.append:
-+ - table: nat
-+ - chain: POSTROUTING
-+ - jump: MASQUERADE
-+ - destination: 0/0
-+ - source: {{ salt['pillar.get']('_param:single_address') }}/{{ salt['pillar.get']('_param:opnfv_net_admin_mask') }}
-+ - save: True
-+
-+iptables_pxe_source:
-+ iptables.append:
-+ - table: filter
-+ - chain: INPUT
-+ - jump: ACCEPT
-+ - destination: 0/0
-+ - source: {{ salt['pillar.get']('_param:single_address') }}/{{ salt['pillar.get']('_param:opnfv_net_admin_mask') }}
-+ - save: True
-+
-+iptables_pxe_destination:
-+ iptables.append:
-+ - table: filter
-+ - chain: INPUT
-+ - jump: ACCEPT
-+ - destination: {{ salt['pillar.get']('_param:single_address') }}/{{ salt['pillar.get']('_param:opnfv_net_admin_mask') }}
-+ - source: 0/0
-+ - save: True
--- /dev/null
+::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
+: Copyright (c) 2018 Mirantis Inc., Enea AB and others.
+:
+: All rights reserved. This program and the accompanying materials
+: are made available under the terms of the Apache License, Version 2.0
+: which accompanies this distribution, and is available at
+: http://www.apache.org/licenses/LICENSE-2.0
+::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
+From: Alexandru Avadanii <Alexandru.Avadanii@enea.com>
+Date: Sun, 23 Sep 2018 03:57:27 +0200
+Subject: [PATCH] Extend wait_for maas.py, wait_for_* attempts arg
+
+1. maas.py: Extend wait_for states with timeout param
+
+Extend the wait_for states with a timeout parameter.
+The timeout value is taken from reclass pillar data if
+defined. Oterwise, the states use the default value.
+Based on Ting's PR [1], slightly refactored.
+
+2. maas.py: Extend `req_status` support to multiple values
+
+Previously, req_status could be one of the MaaS status strings, e.g.
+'Ready'. Extend matching to '|'-separated statuses (e.g.
+'Ready|Deployed') to allow idempotency in MaaS machine commissioning
+and deployment cycles.
+
+Also provide a `maas.machines.wait_for_ready_or_deployed` sls.
+
+3. maas.py: wait_for_*: Add attempts arg
+
+Introduce a new parameter that allows a maximum number of automatic
+recovery attempts for the common failures w/ machine operations.
+If not present in pillar data, it defaults to 0 (OFF).
+
+Common error states, possible cause and automatic recovery pattern:
+* New
+ - usually indicates issues with BMC connectivity (no network route,
+ but on rare occassions it happens due to MaaS API being flaky);
+ - fix: delete the machine, (re)process machine definitions;
+* Failed commissioning
+ - various causes, usually a simple retry works;
+ - fix: delete the machine, (re)process machine definitions;
+* Failed testing
+ - incompatible hardware, missing drivers etc.
+ - usually consistent and board-specific;
+ - fix: override failed testing
+* Allocated
+ - on rare ocassions nodes get stuck in this state instead 'Deploy';
+ - fix: mark-broken, mark-fixed, if it failed at least once before
+ perform a fio test (fixes another unrelated spurious issue with
+ encrypted disks from previous deployments), (re)deploy machines;
+* Failed deployment
+ - various causes, usually a simple retry works;
+ - fix: same as for nodes stuck in 'Allocated';
+
+[1] https://github.com/salt-formulas/salt-formula-maas/pull/34
+
+Change-Id: Ifb7dd9f8fcfbbed557e47d8fdffb1f963604fb15
+Signed-off-by: ting wu <ting.wu@enea.com>
+Signed-off-by: Alexandru Avadanii <Alexandru.Avadanii@enea.com>
+---
+ README.rst | 9 +++-
+ _modules/maas.py | 49 ++++++++++++++++++--
+ maas/machines/wait_for_deployed.sls | 2 +
+ maas/machines/wait_for_ready.sls | 3 ++
+ maas/machines/wait_for_ready_or_deployed.sls | 15 ++++++
+ maas/map.jinja | 4 ++
+ tests/pillar/maas_region.sls | 4 ++
+ 7 files changed, 79 insertions(+), 7 deletions(-)
+ create mode 100644 maas/machines/wait_for_ready_or_deployed.sls
+
+diff --git a/README.rst b/README.rst
+index 20da43e..78d8aef 100644
+--- a/README.rst
++++ b/README.rst
+@@ -622,12 +622,16 @@ Wait for status of selected machine's:
+ machines:
+ - kvm01
+ - kvm02
+- timeout: 1200 # in seconds
++ timeout: {{ region.timeout.ready }}
++ attempts: {{ region.timeout.attempts }}
+ req_status: "Ready"
+ - require:
+ - cmd: maas_login_admin
+ ...
+
++The timeout setting is taken from the reclass pillar data.
++If the pillar data is not defined, it will use the default value.
++
+ If module run w/\o any extra paremeters,
+ ``wait_for_machines_ready`` will wait for defined in salt
+ machines. In this case, it is usefull to skip some machines:
+@@ -642,7 +646,8 @@ machines. In this case, it is usefull to skip some machines:
+ module.run:
+ - name: maas.wait_for_machine_status
+ - kwargs:
+- timeout: 1200 # in seconds
++ timeout: {{ region.timeout.deployed }}
++ attempts: {{ region.timeout.attempts }}
+ req_status: "Deployed"
+ ignore_machines:
+ - kvm01 # in case it's broken or whatever
+diff --git a/_modules/maas.py b/_modules/maas.py
+index c02f104..28e46c5 100644
+--- a/_modules/maas.py
++++ b/_modules/maas.py
+@@ -921,6 +921,7 @@ class MachinesStatus(MaasObject):
+ req_status: string; Polling status
+ machines: list; machine names
+ ignore_machines: list; machine names
++ attempts: max number of automatic hard retries
+ :ret: True
+ Exception - if something fail/timeout reached
+ """
+@@ -929,6 +930,8 @@ class MachinesStatus(MaasObject):
+ req_status = kwargs.get("req_status", "Ready")
+ to_discover = kwargs.get("machines", None)
+ ignore_machines = kwargs.get("ignore_machines", None)
++ attempts = kwargs.get("attempts", 0)
++ failed_attempts = {}
+ if not to_discover:
+ try:
+ to_discover = __salt__['config.get']('maas')['region'][
+@@ -943,11 +946,45 @@ class MachinesStatus(MaasObject):
+ while len(total) <= len(to_discover):
+ for m in to_discover:
+ for discovered in MachinesStatus.execute()['machines']:
+- if m == discovered['hostname'] and \
+- discovered['status'].lower() == req_status.lower():
+- if m in total:
++ if m == discovered['hostname'] and m in total:
++ req_status_list = req_status.lower().split('|')
++ if discovered['status'].lower() in req_status_list:
+ total.remove(m)
+-
++ elif attempts > 0 and (m not in failed_attempts or
++ failed_attempts[m] < attempts):
++ status = discovered['status']
++ sid = discovered['system_id']
++ cls._maas = _create_maas_client()
++ if status in ['Failed commissioning', 'New']:
++ LOG.info('Machine {0} deleted'.format(sid))
++ cls._maas.delete(u'api/2.0/machines/{0}/'
++ .format(sid))
++ Machine().process()
++ elif status in ['Failed testing']:
++ data = {}
++ LOG.info('Machine {0} overriden'.format(sid))
++ action = 'override_failed_testing'
++ cls._maas.post(u'api/2.0/machines/{0}/'
++ .format(sid), action, **data)
++ elif status in ['Failed deployment', 'Allocated']:
++ data = {}
++ LOG.info('Machine {0} mark broken'.format(sid))
++ cls._maas.post(u'api/2.0/machines/{0}/'
++ .format(sid), 'mark_broken', **data)
++ LOG.info('Machine {0} mark fixed'.format(sid))
++ cls._maas.post(u'api/2.0/machines/{0}/'
++ .format(sid), 'mark_fixed', **data)
++ if m in failed_attempts and failed_attempts[m]:
++ LOG.info('Machine {0} fio test'.format(sid))
++ data['testing_scripts'] = 'fio'
++ cls._maas.post(u'api/2.0/machines/{0}/'
++ .format(sid), 'commission', **data)
++ DeployMachines().process()
++ else:
++ continue
++ if m not in failed_attempts:
++ failed_attempts[m] = 0
++ failed_attempts[m] = failed_attempts[m] + 1
+ if len(total) <= 0:
+ LOG.debug(
+ "Machines:{} are:{}".format(to_discover, req_status))
+@@ -959,7 +996,9 @@ class MachinesStatus(MaasObject):
+ "Waiting status:{} "
+ "for machines:{}"
+ "\nsleep for:{}s "
+- "Timeout:{}s".format(req_status, total, poll_time, timeout))
++ "Timeout:{}s ({}s left)"
++ .format(req_status, total, poll_time, timeout,
++ timeout - (time.time() - started_at)))
+ time.sleep(poll_time)
+
+
+diff --git a/maas/machines/wait_for_deployed.sls b/maas/machines/wait_for_deployed.sls
+index ebeedac..a646fdb 100644
+--- a/maas/machines/wait_for_deployed.sls
++++ b/maas/machines/wait_for_deployed.sls
+@@ -9,5 +9,7 @@ wait_for_machines_deployed:
+ - name: maas.wait_for_machine_status
+ - kwargs:
+ req_status: "Deployed"
++ timeout: {{ region.timeout.deployed }}
++ attempts: {{ region.timeout.attempts }}
+ - require:
+ - cmd: maas_login_admin
+diff --git a/maas/machines/wait_for_ready.sls b/maas/machines/wait_for_ready.sls
+index c5d3c28..d8a2963 100644
+--- a/maas/machines/wait_for_ready.sls
++++ b/maas/machines/wait_for_ready.sls
+@@ -7,5 +7,8 @@ maas_login_admin:
+ wait_for_machines_ready:
+ module.run:
+ - name: maas.wait_for_machine_status
++ - kwargs:
++ timeout: {{ region.timeout.ready }}
++ attempts: {{ region.timeout.attempts }}
+ - require:
+ - cmd: maas_login_admin
+diff --git a/maas/machines/wait_for_ready_or_deployed.sls b/maas/machines/wait_for_ready_or_deployed.sls
+new file mode 100644
+index 0000000..db3dcc4
+--- /dev/null
++++ b/maas/machines/wait_for_ready_or_deployed.sls
+@@ -0,0 +1,15 @@
++{%- from "maas/map.jinja" import region with context %}
++
++maas_login_admin:
++ cmd.run:
++ - name: "maas-region apikey --username {{ region.admin.username }} > /var/lib/maas/.maas_credentials"
++
++wait_for_machines_ready_or_deployed:
++ module.run:
++ - name: maas.wait_for_machine_status
++ - kwargs:
++ req_status: "Ready|Deployed"
++ timeout: {{ region.timeout.ready }}
++ attempts: {{ region.timeout.attempts }}
++ - require:
++ - cmd: maas_login_admin
+diff --git a/maas/map.jinja b/maas/map.jinja
+index 0671435..1e6ac07 100644
+--- a/maas/map.jinja
++++ b/maas/map.jinja
+@@ -22,6 +22,10 @@ Debian:
+ bind:
+ host: 0.0.0.0
+ port: 80
++ timeout:
++ ready: 1200
++ deployed: 7200
++ attempts: 0
+ {%- endload %}
+
+ {%- set region = salt['grains.filter_by'](region_defaults, merge=salt['pillar.get']('maas:region', {})) %}
+diff --git a/tests/pillar/maas_region.sls b/tests/pillar/maas_region.sls
+index d3325eb..d710216 100644
+--- a/tests/pillar/maas_region.sls
++++ b/tests/pillar/maas_region.sls
+@@ -34,3 +34,7 @@ maas:
+ password: password
+ username: maas
+ salt_master_ip: 127.0.0.1
++ timeout:
++ deployed: 900
++ ready: 900
++ attempts: 2