Add recovery scripts 25/23125/7
authorQiLiang <liangqi1@huawei.com>
Thu, 13 Oct 2016 22:11:07 +0000 (06:11 +0800)
committerQiLiang <liangqi1@huawei.com>
Fri, 14 Oct 2016 21:28:32 +0000 (05:28 +0800)
- add recovery shell scripts
- add a boot-recovery role that stops services which could leave the
  system boot hanging
- configuring the nfs mount in /etc/fstab makes the system boot hang,
  so the nfs share is mounted during the ansible-playbook run instead
- kill mysqld before mysql recovery, since a running mysqld may cause
  the mysql recovery to fail

JIRA: COMPASS-474

Change-Id: I0f6f0ee935fbe3fbbe28a451a02decfb01a6165b
Signed-off-by: QiLiang <liangqi1@huawei.com>
23 files changed:
deploy/adapters/ansible/openstack/HA-ansible-multinodes.yml
deploy/adapters/ansible/openstack_mitaka/HA-ansible-multinodes.yml
deploy/adapters/ansible/openstack_mitaka/roles/glance/tasks/nfs.yml
deploy/adapters/ansible/openstack_mitaka_xenial/HA-ansible-multinodes.yml
deploy/adapters/ansible/openstack_mitaka_xenial/roles/glance/tasks/nfs.yml
deploy/adapters/ansible/openstack_newton_xenial/HA-ansible-multinodes.yml
deploy/adapters/ansible/roles/boot-recovery/tasks/main.yml [new file with mode: 0755]
deploy/adapters/ansible/roles/boot-recovery/vars/Debian.yml [new file with mode: 0755]
deploy/adapters/ansible/roles/boot-recovery/vars/RedHat.yml [new file with mode: 0755]
deploy/adapters/ansible/roles/boot-recovery/vars/main.yml [new file with mode: 0755]
deploy/adapters/ansible/roles/controller-recovery/vars/Debian.yml
deploy/adapters/ansible/roles/controller-recovery/vars/RedHat.yml
deploy/adapters/ansible/roles/database/tasks/mariadb_cluster_debian.yml
deploy/adapters/ansible/roles/database/tasks/mariadb_cluster_redhat.yml
deploy/adapters/ansible/roles/glance/tasks/main.yml
deploy/adapters/ansible/roles/glance/tasks/nfs.yml
deploy/compass_vm.sh
deploy/host_virtual.sh
deploy/launch.sh
deploy/network.sh
deploy/recovery.sh [new file with mode: 0644]
deploy/template/power/ipmitool.tmpl
recovery.sh [new file with mode: 0755]

index 7f61a1c..95102d2 100644 (file)
   roles:
     - ext-network
 
+# Run the boot-recovery role on the controllers. The role's service-stopping
+# task is guarded by RECOVERY_ENV, so it only acts during a recovery run.
+- hosts: controller
+  remote_user: root
+  accelerate: true
+  max_fail_percentage: 0
+  roles:
+    - boot-recovery
+
 - hosts: controller
   remote_user: root
   accelerate: true
index 7ef467e..c04445d 100644 (file)
   roles:
     - tacker
 
+# Run the boot-recovery role on the controllers. The role's service-stopping
+# task is guarded by RECOVERY_ENV, so it only acts during a recovery run.
+- hosts: controller
+  remote_user: root
+  accelerate: true
+  max_fail_percentage: 0
+  roles:
+    - boot-recovery
+
 - hosts: controller
   remote_user: root
   accelerate: true
index 07dfacd..deec81f 100644 (file)
 - name: get mount info
   command: mount
   register: mount_info
+  tags:
+    - recovery
 
 - name: get nfs server
   shell: awk -F'=' '/compass_server/ {print $2}' /etc/compass.conf
   register: ip_info
+  tags:
+    - recovery
 
 - name: restart host nfs service
   service: name={{ item }} state=restarted enabled=yes
@@ -55,7 +59,9 @@
   shell: |
     mount -t nfs  -onfsvers=3 {{ ip_info.stdout_lines[0] }}:/opt/images /var/lib/glance/images
     sed -i '/\/var\/lib\/glance\/images/d' /etc/fstab
-    echo {{ ip_info.stdout_lines[0] }}:/opt/images /var/lib/glance/images/ nfs nfsvers=3 >> /etc/fstab
+    #echo {{ ip_info.stdout_lines[0] }}:/opt/images /var/lib/glance/images/ nfs nfsvers=3 >> /etc/fstab
   when: mount_info.stdout.find('images') == -1
   retries: 5
   delay: 3
+  tags:
+    - recovery
index ec4c53f..ac31b68 100644 (file)
   roles:
     - ext-network
 
+# Run the boot-recovery role on the controllers. The role's service-stopping
+# task is guarded by RECOVERY_ENV, so it only acts during a recovery run.
+- hosts: controller
+  remote_user: root
+  accelerate: true
+  max_fail_percentage: 0
+  roles:
+    - boot-recovery
+
 - hosts: controller
   remote_user: root
   accelerate: true
index 07dfacd..deec81f 100644 (file)
 - name: get mount info
   command: mount
   register: mount_info
+  tags:
+    - recovery
 
 - name: get nfs server
   shell: awk -F'=' '/compass_server/ {print $2}' /etc/compass.conf
   register: ip_info
+  tags:
+    - recovery
 
 - name: restart host nfs service
   service: name={{ item }} state=restarted enabled=yes
@@ -55,7 +59,9 @@
   shell: |
     mount -t nfs  -onfsvers=3 {{ ip_info.stdout_lines[0] }}:/opt/images /var/lib/glance/images
     sed -i '/\/var\/lib\/glance\/images/d' /etc/fstab
-    echo {{ ip_info.stdout_lines[0] }}:/opt/images /var/lib/glance/images/ nfs nfsvers=3 >> /etc/fstab
+    #echo {{ ip_info.stdout_lines[0] }}:/opt/images /var/lib/glance/images/ nfs nfsvers=3 >> /etc/fstab
   when: mount_info.stdout.find('images') == -1
   retries: 5
   delay: 3
+  tags:
+    - recovery
index 3d5b0a1..9e8ec15 100644 (file)
   roles:
     - ext-network
 
+# Run the boot-recovery role on the controllers. The role's service-stopping
+# task is guarded by RECOVERY_ENV, so it only acts during a recovery run.
+- hosts: controller
+  remote_user: root
+  accelerate: true
+  max_fail_percentage: 0
+  roles:
+    - boot-recovery
+
 - hosts: controller
   remote_user: root
   accelerate: true
diff --git a/deploy/adapters/ansible/roles/boot-recovery/tasks/main.yml b/deploy/adapters/ansible/roles/boot-recovery/tasks/main.yml
new file mode 100755 (executable)
index 0000000..67206bf
--- /dev/null
@@ -0,0 +1,26 @@
+##############################################################################
+# Copyright (c) 2016 HUAWEI TECHNOLOGIES CO.,LTD and others.
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+##############################################################################
+---
+# Make sure RECOVERY_ENV is always defined. The recovery scripts pass it as
+# the string 'True' ("-e RECOVERY_ENV=True"); otherwise it defaults to 'False'.
+- name: Register RECOVERY
+  set_fact: RECOVERY_ENV={{RECOVERY_ENV | default('False')}}
+  tags:
+    - recovery-stop-service
+
+# The flag may be a string, so cast it explicitly instead of relying on how a
+# bare variable is evaluated by "when".
+- include_vars: "{{ ansible_os_family }}.yml"
+  when: RECOVERY_ENV | bool
+  tags:
+    - recovery-stop-service
+
+# Stop (but keep enabled) every service from the per-OS list plus the
+# OS-independent controller_services_noarch list.
+- name: stop controller services
+  service: name={{ item }} state=stopped enabled=yes
+  with_items: controller_services | union(controller_services_noarch)
+  when: RECOVERY_ENV | bool
+  tags:
+    - recovery-stop-service
+
diff --git a/deploy/adapters/ansible/roles/boot-recovery/vars/Debian.yml b/deploy/adapters/ansible/roles/boot-recovery/vars/Debian.yml
new file mode 100755 (executable)
index 0000000..084deeb
--- /dev/null
@@ -0,0 +1,14 @@
+##############################################################################
+# Copyright (c) 2016 HUAWEI TECHNOLOGIES CO.,LTD and others.
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+##############################################################################
+---
+# Services the boot-recovery role stops on Debian-family controllers.
+controller_services:
+    - cron
+    - aodh-expirer
+    - neutron-openvswitch-agent
+    - mysql
diff --git a/deploy/adapters/ansible/roles/boot-recovery/vars/RedHat.yml b/deploy/adapters/ansible/roles/boot-recovery/vars/RedHat.yml
new file mode 100755 (executable)
index 0000000..c46f79c
--- /dev/null
@@ -0,0 +1,15 @@
+##############################################################################
+# Copyright (c) 2016 HUAWEI TECHNOLOGIES CO.,LTD and others.
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+##############################################################################
+---
+# Services the boot-recovery role stops on RedHat-family controllers.
+controller_services:
+    # the cron daemon's service is named crond on RedHat-family systems
+    - crond
+    - neutron-openvswitch-agent
+    - openstack-aodh-expirer
+    - mysql
+
diff --git a/deploy/adapters/ansible/roles/boot-recovery/vars/main.yml b/deploy/adapters/ansible/roles/boot-recovery/vars/main.yml
new file mode 100755 (executable)
index 0000000..22af29f
--- /dev/null
@@ -0,0 +1,11 @@
+##############################################################################
+# Copyright (c) 2016 HUAWEI TECHNOLOGIES CO.,LTD and others.
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+##############################################################################
+---
+# OS-independent services to stop; tasks/main.yml unions this list with the
+# per-OS controller_services. Empty by default.
+controller_services_noarch: []
+
index 35c0a95..145acec 100644 (file)
@@ -36,4 +36,5 @@ controller_services:
     - openstack-aodh-notifier
     - openstack-aodh-evaluator
     - openstack-aodh-listener
+    - cron
 
index 6b67031..f083a40 100644 (file)
@@ -7,6 +7,14 @@
 # http://www.apache.org/licenses/LICENSE-2.0
 ##############################################################################
 ---
+# Make sure RECOVERY_ENV is always defined; it may be the string 'True' or
+# 'False', so it is cast with "| bool" where it is tested.
+- name: Register RECOVERY
+  set_fact: RECOVERY_ENV={{RECOVERY_ENV | default('False')}}
+
+# Recovery only: a mysqld that is still running may make the mysql recovery
+# fail, so kill it first. killall exits non-zero when no mysqld exists,
+# hence ignore_errors.
+- name: killall mysqld processes
+  shell: sudo killall -9  mysqld
+  when: RECOVERY_ENV | bool
+  ignore_errors: True
+
 - name: get cluster status
   shell: mysql --silent --skip-column-names -e 'SHOW STATUS LIKE "wsrep_evs_state"'|awk '{print $2}'
   register: cluster_status
index da1b863..cfd778f 100644 (file)
@@ -7,6 +7,14 @@
 # http://www.apache.org/licenses/LICENSE-2.0
 ##############################################################################
 ---
+# Make sure RECOVERY_ENV is always defined; it may be the string 'True' or
+# 'False', so it is cast with "| bool" where it is tested.
+- name: Register RECOVERY
+  set_fact: RECOVERY_ENV={{RECOVERY_ENV | default('False')}}
+
+# Recovery only: a mysqld that is still running may make the mysql recovery
+# fail, so kill it first. killall exits non-zero when no mysqld exists,
+# hence ignore_errors.
+- name: killall mysqld processes
+  shell: sudo killall -9 mysqld
+  when: RECOVERY_ENV | bool
+  ignore_errors: True
+
 - name: get cluster status
   shell: mysql --silent --skip-column-names -e 'SHOW STATUS LIKE "wsrep_evs_state"'|awk '{print $2}'
   register: cluster_status
index a78ba77..caece26 100644 (file)
@@ -8,6 +8,8 @@
 ##############################################################################
 ---
 - include_vars: "{{ ansible_os_family }}.yml"
+  tags:
+    - recovery
 
 - include: glance_install.yml
   tags:
index 7895c38..179229d 100644 (file)
 - name: get mount info
   command: mount
   register: mount_info
+  tags:
+    - recovery
 
 - name: get nfs server
   shell: awk -F'=' '/compass_server/ {print $2}' /etc/compass.conf
   register: ip_info
+  tags:
+    - recovery
 
 - name: restart host nfs service
   service: name={{ item }} state=restarted enabled=yes
@@ -51,7 +55,9 @@
   shell: |
     mount -t nfs  -onfsvers=3 {{ ip_info.stdout_lines[0] }}:/opt/images /var/lib/glance/images
     sed -i '/\/var\/lib\/glance\/images/d' /etc/fstab
-    echo {{ ip_info.stdout_lines[0] }}:/opt/images /var/lib/glance/images/ nfs nfsvers=3 >> /etc/fstab
+    #echo {{ ip_info.stdout_lines[0] }}:/opt/images /var/lib/glance/images/ nfs nfsvers=3 >> /etc/fstab
   when: mount_info.stdout.find('images') == -1
   retries: 5
   delay: 3
+  tags:
+    - recovery
index dc391ac..7e2ce40 100755 (executable)
@@ -151,3 +151,75 @@ function launch_compass() {
     set +e
     log_info "launch_compass exit"
 }
+
+function recover_compass() {
+    log_info "recover_compass enter"
+
+    sudo virsh start compass
+
+    if ! wait_ok 500;then
+        log_error "install os timeout"
+        exit 1
+    fi
+
+    log_info "launch_compass exit"
+}
+
+function _check_hosts_reachable() {
+    retry=0
+
+    while true; do
+        sleep 1
+        let retry+=1
+        if [[ $retry -ge $1 ]]; then
+            log_error "hosts boot time out"
+            echo "fail"
+            return
+        fi
+
+        ssh $ssh_args root@$MGMT_IP "
+            cd /var/ansible/run/$ADAPTER_NAME'-'$CLUSTER_NAME;
+            ansible -i inventories/inventory.yml $2 -m ping
+        " > /dev/null
+        if [ $? == 0 ]; then
+            break
+        fi
+    done
+    echo "ok"
+}
+
+function check_hosts_reachable() {
+    ret=$(_check_hosts_reachable $1 compute)
+    if [[ "$ret" == "fail" ]]; then
+        echo $ret
+        return
+    fi
+
+    ret=$(_check_hosts_reachable 100 controller)
+    echo $ret
+}
+
+function recover_hosts() {
+    ssh $ssh_args root@$MGMT_IP "
+        cd /var/ansible/run/$ADAPTER_NAME'-'$CLUSTER_NAME;
+        ansible-playbook \
+            -i inventories/inventory.yml HA-ansible-multinodes.yml \
+            -t recovery \
+            -e 'RECOVERY_ENV=True'
+    "
+    if [ $? == 0 ]; then
+        echo "Recovery Complete!"
+    fi
+}
+
+# Wait a fixed time, then run only the tasks tagged recovery-stop-service of
+# HA-ansible-multinodes.yml (with RECOVERY_ENV=True) over ssh on $MGMT_IP.
+# The playbook's exit status is not checked.
+function wait_controller_nodes_ok() {
+    # fixed grace period before the playbook run
+    sleep 100
+    ssh $ssh_args root@$MGMT_IP "
+        cd /var/ansible/run/$ADAPTER_NAME'-'$CLUSTER_NAME;
+        ansible-playbook \
+            -i inventories/inventory.yml HA-ansible-multinodes.yml \
+            -t recovery-stop-service \
+            -e 'RECOVERY_ENV=True'
+    "
+    # fixed pause after the playbook run
+    sleep 30
+}
index 2fab2c9..0a991f1 100755 (executable)
@@ -54,6 +54,19 @@ function launch_host_vms() {
     IFS=$old_ifs
 }
 
+function recover_host_vms() {
+    old_ifs=$IFS
+    IFS=,
+
+    for host in $HOSTNAMES; do
+        sudo virsh destroy $host
+        sleep 2
+        sudo virsh start $host
+        sleep 2
+    done
+    IFS=$old_ifs
+}
+
 function get_host_macs() {
     local mac_generator=${COMPASS_DIR}/deploy/mac_generator.sh
     local machines=
index 976af3c..6db9f36 100755 (executable)
@@ -12,7 +12,8 @@ WORK_DIR=$COMPASS_DIR/work/deploy
 
 mkdir -p $WORK_DIR/script
 
-export DEPLOY_FIRST_TIME=${DEPLOY_FIRST_TIME-"true"}
+export DEPLOY_FIRST_TIME=${DEPLOY_FIRST_TIME:-"true"}
+export DEPLOY_RECOVERY=${DEPLOY_RECOVERY:-"false"}
 
 source ${COMPASS_DIR}/deploy/prepare.sh
 prepare_python_env
@@ -31,9 +32,14 @@ source ${COMPASS_DIR}/deploy/compass_vm.sh
 source ${COMPASS_DIR}/deploy/deploy_host.sh
 
 ######################### main process
-if [[ "$EXPANSION" == "false" ]]
-then
 
+if [[ "$DEPLOY_RECOVERY"  == "true" ]]; then
+    source ${COMPASS_DIR}/deploy/recovery.sh
+    recover_cluster
+    exit 0
+fi
+
+if [[ "$EXPANSION" == "false" ]]; then
     print_logo
 
     if [[ ! -z $VIRT_NUMBER ]];then
index 46b8c02..6c67822 100755 (executable)
@@ -29,6 +29,13 @@ function setup_bridge_net()
     sudo virsh net-start $net_name
 }
 
+# Start the libvirt network named by $1.
+# Arguments: $1 - libvirt network name (also stored in the global net_name)
+function recover_bridge_net()
+{
+    net_name=$1
+
+    sudo virsh net-start $net_name
+}
+
 function save_network_info()
 {
     sudo ovs-vsctl list-br |grep br-external
@@ -69,6 +76,13 @@ function setup_bridge_external()
     python $COMPASS_DIR/deploy/setup_vnic.py
 }
 
+# Start the libvirt "external" network, then run setup_vnic.py.
+function recover_bridge_external()
+{
+    sudo virsh net-start external
+
+    python $COMPASS_DIR/deploy/setup_vnic.py
+}
+
 function setup_nat_net() {
     net_name=$1
     gw=$2
@@ -92,11 +106,20 @@ function setup_nat_net() {
     sudo virsh net-start $net_name
 }
 
+# Start the libvirt network named by $1.
+# Arguments: $1 - libvirt network name (also stored in the global net_name)
+function recover_nat_net() {
+    net_name=$1
+
+    sudo virsh net-start $net_name
+}
 
 function setup_virtual_net() {
   setup_nat_net install $INSTALL_GW $INSTALL_MASK
 }
 
+# Virtual deployments: start the "install" network via recover_nat_net.
+function recover_virtual_net() {
+  recover_nat_net install
+}
+
 function setup_baremetal_net() {
   if [[ -z $INSTALL_NIC ]]; then
     exit 1
@@ -104,6 +127,13 @@ function setup_baremetal_net() {
   setup_bridge_net install $INSTALL_NIC
 }
 
+function recover_baremetal_net() {
+  if [[ -z $INSTALL_NIC ]]; then
+    exit 1
+  fi
+  recover_bridge_net install
+}
+
 function setup_network_boot_scripts() {
     sudo cp $COMPASS_DIR/deploy/network.sh /usr/sbin/network_setup
     sudo chmod +777 /usr/sbin/network_setup
@@ -134,3 +164,14 @@ function create_nets() {
     setup_network_boot_scripts
 }
 
+# Start the networks of an existing deployment: mgmt, install and external,
+# then call clear_forward_rejct_rules (defined elsewhere).
+# Globals: TYPE (read) - selects the install-network function to call
+function recover_nets() {
+    recover_nat_net mgmt
+
+    # recover install network
+    # resolves to recover_<TYPE>_net, e.g. recover_virtual_net or
+    # recover_baremetal_net
+    recover_"$TYPE"_net
+
+    # recover external network
+    recover_bridge_external
+    clear_forward_rejct_rules
+}
+
diff --git a/deploy/recovery.sh b/deploy/recovery.sh
new file mode 100644 (file)
index 0000000..db85848
--- /dev/null
@@ -0,0 +1,40 @@
+#!/bin/bash
+##############################################################################
+# Copyright (c) 2016 HUAWEI TECHNOLOGIES CO.,LTD and others.
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+##############################################################################
+
+function recover_cluster() {
+    recover_nets
+    recover_compass
+
+    i=0
+    MAX_RETRY_TIMES=2
+    while [ $i -lt $MAX_RETRY_TIMES ]; do
+        let i+=1
+
+        if [[ ! -z $VIRT_NUMBER ]];then
+            recover_host_vms
+        else
+            reboot_hosts
+        fi
+
+        ret=$(check_hosts_reachable 500)
+        if [[ "$ret" == "ok" ]];then
+            break
+        fi
+    done
+
+    if [[ $i -ge $MAX_RETRY_TIMES ]]; then
+        echo "Recovery Failure !!!"
+        exit 1
+    fi
+
+    wait_controller_nodes_ok
+    recover_hosts
+}
+
index a297e00..048e997 100644 (file)
@@ -40,19 +40,23 @@ for i in {1..5}; do
     fi
 done
 sleep 1
-for i in {1..5}; do
-    if ipmitool -I $interface -H $ipmiIp -U $ipmiUser -P $ipmiPass chassis bootdev pxe >/dev/null 2>&1
-    then
-        break
-    elif [[ i -lt 5 ]]
-    then
-        sleep 1
-    else
-        log_error "set $ipmiIp pxe fail"
-        exit 1
-    fi
-done
-sleep 1
+
+if [[ "\$DEPLOY_RECOVERY"  != "true" ]]; then
+    for i in {1..5}; do
+        if ipmitool -I $interface -H $ipmiIp -U $ipmiUser -P $ipmiPass chassis bootdev pxe >/dev/null 2>&1
+        then
+            break
+        elif [[ i -lt 5 ]]
+        then
+            sleep 1
+        else
+            log_error "set $ipmiIp pxe fail"
+            exit 1
+        fi
+    done
+    sleep 1
+fi
+
 for i in {1..5}; do
     if ipmitool -I $interface -H $ipmiIp -U $ipmiUser -P $ipmiPass chassis power reset >/dev/null 2>&1
     then
diff --git a/recovery.sh b/recovery.sh
new file mode 100755 (executable)
index 0000000..1b18862
--- /dev/null
@@ -0,0 +1,15 @@
+#!/bin/bash
+##############################################################################
+# Copyright (c) 2016 HUAWEI TECHNOLOGIES CO.,LTD and others.
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+##############################################################################
+
+export DEPLOY_RECOVERY="true"
+export DEPLOY_FIRST_TIME="false"
+
+./run.sh
+