Refactor cellv2 host discovery logic to avoid races
author Oliver Walsh <owalsh@redhat.com>
Fri, 20 Oct 2017 22:27:15 +0000 (23:27 +0100)
committer Oliver Walsh <owalsh@redhat.com>
Fri, 10 Nov 2017 09:25:25 +0000 (09:25 +0000)
The compute service list is polled until all expected hosts are reported or a
timeout occurs (600s).

Adds a cellv2_discovery flag to puppet services. Used to generate a list of
hosts that should have cellv2 host mappings.

Adds a canonical fqdn that should match the fqdn reported by a host.

Adds the ability to upload a config script for docker config instead of using
complex bash one-liners.

Closes-bug: 1720821
Change-Id: I33e2f296526c957cb5f96dff19682a4e60c6a0f0
(cherry picked from commit 61fcfca045aeb5be1ee280d8dd9c260fb39b9084)

common/deploy-steps.j2
common/services.yaml
docker/services/nova-api.yaml
docker/services/nova-compute.yaml
overcloud.j2.yaml
puppet/all-nodes-config.j2.yaml
puppet/role.role.j2.yaml
tools/yaml-validate.py

index 5c923a9..542bf72 100644 (file)
@@ -194,6 +194,7 @@ resources:
                   kolla_config: {get_param: [role_data, {{role.name}}, kolla_config]}
                   bootstrap_server_id: {get_param: [servers, {{primary_role_name}}, '0']}
                   puppet_step_config: {get_param: [role_data, {{role.name}}, step_config]}
+                  docker_config_scripts: {get_param: [role_data, {{role.name}}, docker_config_scripts]}
                 tasks:
                   # Join host_prep_tasks with the other per-host configuration
                   list_concat:
@@ -217,6 +218,11 @@ resources:
                       # FIXME: can we move docker-puppet somewhere so it's installed via a package?
                       - name: Write docker-puppet.py
                         copy: content="{{docker_puppet_script}}" dest=/var/lib/docker-puppet/docker-puppet.py force=yes mode=0600
+                      - name: Create /var/lib/docker-config-scripts
+                        file: path=/var/lib/docker-config-scripts state=directory
+                      - name: Write docker config scripts
+                        copy: content="{{item.value.content}}" dest="/var/lib/docker-config-scripts/{{item.key}}" force=yes mode="{{item.value.mode|default('0600', true)}}"
+                        with_dict: "{{docker_config_scripts}}"
                       # Here we are dumping all the docker container startup configuration data
                       # so that we can have access to how they are started outside of heat
                       # and docker-cmd.  This lets us create command line tools to test containers.
index a0015c7..f706206 100644 (file)
@@ -89,6 +89,31 @@ resources:
             service_names: {get_attr: [ServiceChain, role_data, service_names]}
             docker_config: {get_attr: [ServiceChain, role_data, docker_config]}
 
+  DockerConfigScripts:
+    type: OS::Heat::Value
+    properties:
+      type: json
+      value:
+        yaql:
+          expression:
+            # Pair each service name with its 'docker_config_scripts', keep only the non-null entries and merge them into one map
+            coalesce($.data.service_names, []).zip(coalesce($.data.docker_config_scripts, [])).where($[1] != null).select($[1]).reduce($1.mergeWith($2), {})
+          data:
+            service_names: {get_attr: [ServiceChain, role_data, service_names]}
+            docker_config_scripts: {get_attr: [ServiceChain, role_data, docker_config_scripts]}
+
+  CellV2Discovery:
+    type: OS::Heat::Value
+    properties:
+      type: boolean
+      value:
+        yaql:
+          expression:
+            # True when at least one service in this role sets cellv2_discovery to true; a null list is treated as empty (false)
+            coalesce($.data.cellv2_discovery, []).contains(true)
+          data:
+            cellv2_discovery: {get_attr: [ServiceChain, role_data, cellv2_discovery]}
+
   LoggingSourcesConfig:
     type: OS::Heat::Value
     properties:
@@ -282,5 +307,7 @@ outputs:
       puppet_config: {get_attr: [PuppetConfig, value]}
       kolla_config: {get_attr: [KollaConfig, value]}
       docker_config: {get_attr: [DockerConfig, value]}
+      docker_config_scripts: {get_attr: [DockerConfigScripts, value]}
       docker_puppet_tasks: {get_attr: [DockerPuppetTasks, value]}
       host_prep_tasks: {get_attr: [HostPrepTasks, value]}
+      cellv2_discovery: {get_attr: [CellV2Discovery, value]}
index 7f1b7a5..ee73f70 100644 (file)
@@ -113,6 +113,58 @@ outputs:
             - path: /var/log/nova
               owner: nova:nova
               recurse: true
+      docker_config_scripts:
+        nova_api_discover_hosts.sh:
+          mode: "0700"
+          content: |
+            #!/bin/bash
+            # Polls the compute service list until every host named in the
+            # cellv2_discovery_hosts hiera key has registered, or until
+            # $timeout seconds have elapsed, then runs
+            # "nova-manage cell_v2 discover_hosts". Discovery is run even when
+            # the wait times out.
+
+            # Credentials for the openstack client are read from nova.conf.
+            export OS_PROJECT_DOMAIN_NAME=$(crudini --get /etc/nova/nova.conf keystone_authtoken project_domain_name)
+            export OS_USER_DOMAIN_NAME=$(crudini --get /etc/nova/nova.conf keystone_authtoken user_domain_name)
+            export OS_PROJECT_NAME=$(crudini --get /etc/nova/nova.conf keystone_authtoken project_name)
+            export OS_USERNAME=$(crudini --get /etc/nova/nova.conf keystone_authtoken username)
+            export OS_PASSWORD=$(crudini --get /etc/nova/nova.conf keystone_authtoken password)
+            export OS_AUTH_URL=$(crudini --get /etc/nova/nova.conf keystone_authtoken auth_url)
+            export OS_AUTH_TYPE=password
+            export OS_IDENTITY_API_VERSION=3
+
+            echo "(cellv2) Running cell_v2 host discovery"
+            timeout=600
+            loop_wait=30
+            # Set of hosts still expected to register; entries are removed as
+            # they appear in the compute service list.
+            declare -A discoverable_hosts
+            for host in $(hiera -c /etc/puppet/hiera.yaml cellv2_discovery_hosts | sed -e '/^nil$/d' |  tr "," " "); do discoverable_hosts[$host]=1; done
+            timeout_at=$(( $(date +"%s") + ${timeout} ))
+            echo "(cellv2) Waiting ${timeout} seconds for hosts to register"
+            finished=0
+            while : ; do
+              for host in $(openstack -q compute service list -c 'Host' -c 'Zone' -f value | awk '$2 != "internal" { print $1 }'); do
+                if (( discoverable_hosts[$host] == 1 )); then
+                  echo "(cellv2) compute node $host has registered"
+                  # Quoted so the subscript is never subject to glob expansion.
+                  unset "discoverable_hosts[$host]"
+                fi
+              done
+              finished=1
+              for host in "${!discoverable_hosts[@]}"; do
+                if (( ${discoverable_hosts[$host]} == 1 )); then
+                  echo "(cellv2) compute node $host has not registered"
+                  finished=0
+                fi
+              done
+              remaining=$(( $timeout_at - $(date +"%s") ))
+              if (( $finished == 1 )); then
+                echo "(cellv2) All nodes registered"
+                break
+              elif (( $remaining <= 0 )); then
+                echo "(cellv2) WARNING: timeout waiting for nodes to register, running host discovery regardless"
+                # Split the comma-separated list into lines before sorting so
+                # that sort -u actually orders and de-duplicates the hosts.
+                echo "(cellv2) Expected host list:" $(hiera -c /etc/puppet/hiera.yaml cellv2_discovery_hosts | sed -e '/^nil$/d' | tr ',' '\n' | sort -u | tr '\n' ' ')
+                echo "(cellv2) Detected host list:" $(openstack -q compute service list -c 'Host' -c 'Zone' -f value | awk '$2 != "internal" { print $1 }' | sort -u | tr '\n' ' ')
+                break
+              else
+                echo "(cellv2) Waiting ${remaining} seconds for hosts to register"
+                sleep $loop_wait
+              fi
+            done
+            echo "(cellv2) Running host discovery..."
+            su nova -s /bin/bash -c "/usr/bin/nova-manage cell_v2 discover_hosts --verbose"
       docker_config:
         # db sync runs before permissions set by kolla_config
         step_2:
@@ -223,9 +275,16 @@ outputs:
             image: *nova_api_image
             net: host
             detach: false
-            volumes: *nova_api_bootstrap_volumes
+            volumes:
+              list_concat:
+                - *nova_api_bootstrap_volumes
+                -
+                  - /var/lib/config-data/nova/etc/my.cnf.d/tripleo.cnf:/etc/my.cnf.d/tripleo.cnf:ro
+                  - /var/lib/config-data/nova/etc/nova/:/etc/nova/:ro
+                  - /var/log/containers/nova:/var/log/nova
+                  - /var/lib/docker-config-scripts/nova_api_discover_hosts.sh:/nova_api_discover_hosts.sh:ro
             user: root
-            command: "/usr/bin/bootstrap_host_exec nova_api su nova -s /bin/bash -c '/usr/bin/nova-manage cell_v2 discover_hosts --verbose'"
+            command: "/usr/bin/bootstrap_host_exec nova_api /nova_api_discover_hosts.sh"
       metadata_settings:
         get_attr: [NovaApiBase, role_data, metadata_settings]
       host_prep_tasks:
index b43193e..6db9e58 100644 (file)
@@ -73,6 +73,7 @@ outputs:
     description: Role data for the Nova Compute service.
     value:
       service_name: {get_attr: [NovaComputeBase, role_data, service_name]}
+      cellv2_discovery: true
       config_settings:
         get_attr: [NovaComputeBase, role_data, config_settings]
       logging_source: {get_attr: [NovaComputeBase, role_data, logging_source]}
@@ -111,7 +112,6 @@ outputs:
               owner: nova:nova
               recurse: true
       docker_config:
-        # FIXME: run discover hosts here
         step_4:
           nova_compute:
             image: &nova_compute_image {get_param: DockerNovaComputeImage}
index 3506fe8..9ea195d 100644 (file)
@@ -642,6 +642,21 @@ resources:
 {% for role in roles %}
           - {get_attr: [{{role.name}}ServiceNames, value]}
 {% endfor %}
+      cellv2_discovery_hosts:
+        # Comma-joined canonical hostnames of every role that has a service requiring cellv2 host discovery
+        list_join:
+          - ','
+          - yaql:
+              expression: coalesce($.data.e.zip($.data.l).where($[0]).select($[1]).flatten(),  [])
+              data:
+                e: # list of true/false flags, one per role: whether cellv2 host discovery is required for that role
+{%- for role in roles %}
+                  - {get_attr: [{{role.name}}ServiceChainRoleData, value, cellv2_discovery]}
+{%- endfor %}
+                l: # list of lists of canonical hostnames, one list per role (same role order as e)
+{%- for role in roles %}
+                  - {get_attr: [{{role.name}}, hostname_map, canonical]}
+{%- endfor %}
       controller_ips: {get_attr: [{{primary_role_name}}, ip_address]}
       controller_names: {get_attr: [{{primary_role_name}}, hostname]}
       service_ips:
index bdd2bcf..6594962 100644 (file)
@@ -22,6 +22,8 @@ parameters:
     type: json
   controller_names:
     type: comma_delimited_list
+  cellv2_discovery_hosts:
+    type: comma_delimited_list
   NetVipMap:
     type: json
   RedisVirtualIP:
@@ -141,6 +143,10 @@ resources:
                   list_join:
                   - ','
                   - {get_param: controller_names}
+              - cellv2_discovery_hosts:
+                  list_join:
+                  - ','
+                  - {get_param: cellv2_discovery_hosts}
                 deploy_identifier: {get_param: DeployIdentifier}
                 update_identifier: {get_param: UpdateIdentifier}
                 stack_action: {get_param: StackAction}
index d53afd0..a3cbe85 100644 (file)
@@ -477,6 +477,14 @@ resources:
             - '.'
             - - {get_attr: [{{server_resource_name}}, name]}
               - ctlplane
+        canonical:
+          fqdn:  # server name joined to CloudDomain with '.'
+            list_join:
+            - '.'
+            - - {get_attr: [{{server_resource_name}}, name]}
+              - {get_param: CloudDomain}
+          short:  # NOTE(review): one-element list holding the server name; confirm consumers expect a list, not a string
+            - {get_attr: [{{server_resource_name}}, name]}
 
   PreNetworkConfig:
     type: OS::TripleO::{{role.name}}::PreNetworkConfig
@@ -602,6 +610,7 @@ resources:
             fqdn_management: {get_attr: [NetHostMap, value, management, fqdn]}
             fqdn_ctlplane: {get_attr: [NetHostMap, value, ctlplane, fqdn]}
             fqdn_external: {get_attr: [NetHostMap, value, external, fqdn]}
+            fqdn_canonical: {get_attr: [NetHostMap, value, canonical, fqdn]}
 
   # Resource for site-specific injection of root certificate
   NodeTLSCAData:
@@ -696,6 +705,7 @@ outputs:
       {{network.name_lower|default(network.name.lower())}}: {get_attr: [NetHostMap, value, {{network.name_lower|default(network.name.lower()) }}, fqdn]}
   {%- endfor %}
       ctlplane: {get_attr: [NetHostMap, value, ctlplane, fqdn]}
+      canonical: {get_attr: [NetHostMap, value, canonical, fqdn]}
   hosts_entry:
     value:
       str_replace:
index 76f856d..9279f1d 100755 (executable)
@@ -31,14 +31,15 @@ envs_containing_endpoint_map = ['tls-endpoints-public-dns.yaml',
                                 'tls-endpoints-public-ip.yaml',
                                 'tls-everywhere-endpoints-dns.yaml']
 ENDPOINT_MAP_FILE = 'endpoint_map.yaml'
-OPTIONAL_SECTIONS = ['workflow_tasks']
+OPTIONAL_SECTIONS = ['workflow_tasks', 'cellv2_discovery']
 REQUIRED_DOCKER_SECTIONS = ['service_name', 'docker_config', 'puppet_config',
                             'config_settings', 'step_config']
 OPTIONAL_DOCKER_SECTIONS = ['docker_puppet_tasks', 'upgrade_tasks',
                             'post_upgrade_tasks', 'update_tasks',
                             'service_config_settings',
                             'host_prep_tasks', 'metadata_settings',
-                            'kolla_config', 'logging_source', 'logging_groups']
+                            'kolla_config', 'logging_source',
+                            'logging_groups', 'docker_config_scripts']
 REQUIRED_DOCKER_PUPPET_CONFIG_SECTIONS = ['config_volume', 'step_config',
                                           'config_image']
 OPTIONAL_DOCKER_PUPPET_CONFIG_SECTIONS = [ 'puppet_tags', 'volumes' ]