`maas_fixup` is already re-entrant, so we can execute it more than
once during a commissioning/deploy cycle. Reduce the timeout waiting
for all nodes to reach a stable state, so nodes stuck in 'Ready'
state instead of reaching 'Deploying' get dealt with sooner (~5 min
vs old 30 min).
While at it, let `maas_fixup` handle machine deploy as well, so we
can catch nodes stuck in 'Ready' state and re-trigger the deploy.
Change-Id: Id24cc97b17489835c5846288639a9a6032bd320a
Signed-off-by: Alexandru Avadanii <Alexandru.Avadanii@enea.com>
(cherry picked from commit 8da73521d3b9347a982ea6e77114bba0d0f0adeb)
local statuscmd="salt 'mas01*' --out yaml state.apply maas.machines.status"
local ncount=$(salt --out yaml 'mas01*' pillar.get maas:region:machines | \
grep -cE '^\s{2}\w+:$')
local statuscmd="salt 'mas01*' --out yaml state.apply maas.machines.status"
local ncount=$(salt --out yaml 'mas01*' pillar.get maas:region:machines | \
grep -cE '^\s{2}\w+:$')
- wait_for 180 "${statuscmd} | tee /dev/stderr | " \
+ wait_for 30 "${statuscmd} | tee /dev/stderr | " \
"grep -Eq '((Deployed|Ready): ${ncount}|status: (Failed|Allocated))'"
local statusout=$(eval "${statuscmd}")
"grep -Eq '((Deployed|Ready): ${ncount}|status: (Failed|Allocated))'"
local statusout=$(eval "${statuscmd}")
for node_system_id in ${fcnodes}; do
salt -C 'mas01*' state.apply maas.machines.delete \
pillar="{'system_id': '${node_system_id}'}"
for node_system_id in ${fcnodes}; do
salt -C 'mas01*' state.apply maas.machines.delete \
pillar="{'system_id': '${node_system_id}'}"
done
if [ -n "${fcnodes}" ]; then
salt -C 'mas01*' state.apply maas.machines
done
if [ -n "${fcnodes}" ]; then
salt -C 'mas01*' state.apply maas.machines
local fdnodes=$(echo "${statusout}" | \
grep -Pzo 'status: (Failed deployment|Allocated)\n\s+system_id: \K.+\n')
local fdnodes=$(echo "${statusout}" | \
grep -Pzo 'status: (Failed deployment|Allocated)\n\s+system_id: \K.+\n')
+ local rnodes=$(echo "${statusout}" | \
+ grep -Pzo 'status: Ready\n\s+system_id: \K.+\n')
for node_system_id in ${fdnodes}; do
salt -C 'mas01*' state.apply maas.machines.mark_broken_fixed \
pillar="{'system_id': '${node_system_id}'}"
for node_system_id in ${fdnodes}; do
salt -C 'mas01*' state.apply maas.machines.mark_broken_fixed \
pillar="{'system_id': '${node_system_id}'}"
- if [ -n "${fdnodes}" ]; then
+ if [ -n "${fdnodes}" ] || [ -n "${rnodes}" ]; then
salt -C 'mas01*' state.apply maas.machines.deploy
return 1
fi
salt -C 'mas01*' state.apply maas.machines.deploy
return 1
fi
wait_for 10 "salt -C 'mas01*' state.apply maas.region"
salt -C 'mas01*' state.apply maas.machines
wait_for 10 "salt -C 'mas01*' state.apply maas.region"
salt -C 'mas01*' state.apply maas.machines
+# MaaS node deployment
+wait_for 20 maas_fixup
# cleanup outdated salt keys
salt-key --out yaml | awk '!/^(minions|- cfg01|- mas01)/ {print $2}' | \
xargs -I{} salt-key -yd {}
# cleanup outdated salt keys
salt-key --out yaml | awk '!/^(minions|- cfg01|- mas01)/ {print $2}' | \
xargs -I{} salt-key -yd {}
-# MaaS node deployment
-salt -C 'mas01*' state.apply maas.machines.deploy
-wait_for 10 maas_fixup
-
salt -C 'mas01*' pillar.item\
maas:region:admin:username \
maas:region:admin:password
salt -C 'mas01*' pillar.item\
maas:region:admin:username \
maas:region:admin:password
+maas login {{ region.admin.username }} \
+ http://{{ region.bind.host }}:5240/MAAS/api/2.0 - < \
+ /var/lib/maas/.maas_credentials || exit 1
+maas login {{ region.admin.username }} \
+ http://{{ region.bind.host }}:5240/MAAS/api/2.0 - < \
+ /var/lib/maas/.maas_credentials || exit 1
-+# wait max 15 min for service up / image download, 5 min region to rack sync
-+wait_for 90 "grep -qzE '(Unable to probe for DHCP servers|DHCP probe complete).*Rack controller' /var/log/maas/rackd.log"
++# wait max 5 min for service up, 15 min image download, 5 min region to rack sync
++wait_for 30 "grep -qzE '(Unable to probe for DHCP servers|DHCP probe complete).*Rack controller' /var/log/maas/rackd.log"
+maas opnfv boot-resources import || exit 2
+wait_for 90 "! maas opnfv boot-resources is-importing | grep -q 'true'"
+maas opnfv rack-controllers import-boot-images || exit 3
+maas opnfv boot-resources import || exit 2
+wait_for 90 "! maas opnfv boot-resources is-importing | grep -q 'true'"
+maas opnfv rack-controllers import-boot-images || exit 3
# TODO: implement mark_broken_fixed_machine via _modules/maas.py
mark_broken_fixed_machine:
cmd.run:
# TODO: implement mark_broken_fixed_machine via _modules/maas.py
mark_broken_fixed_machine:
cmd.run:
- - name: "maas login {{ region.admin.username }} http://{{ region.bind.host }}:5240/MAAS/api/2.0 - < /var/lib/maas/.maas_credentials && maas opnfv machine mark-broken {{ pillar['system_id'] }} && sleep 30 && maas opnfv machine mark-fixed {{ pillar['system_id'] }}"
+ - name: "maas login {{ region.admin.username }} http://{{ region.bind.host }}:5240/MAAS/api/2.0 - < /var/lib/maas/.maas_credentials && maas opnfv machine mark-broken {{ pillar['system_id'] }} && sleep 10 && maas opnfv machine mark-fixed {{ pillar['system_id'] }}"
- require:
- cmd: maas_login_admin
- require:
- cmd: maas_login_admin