src/ceph/qa/tasks/systemd.py

   1 """
   2 Systemd test
   3 """
   4 import contextlib
   5 import logging
   6 import re
   7 import time
   8
   9 from cStringIO import StringIO
  10 from teuthology.orchestra import run
  11 from teuthology.misc import reconnect, get_first_mon, wait_until_healthy
  12
  13 log = logging.getLogger(__name__)
  14
  15
  16 @contextlib.contextmanager
  17 def task(ctx, config):
  18     """
  19       - tasks:
  20           ceph-deploy:
  21           systemd:
  22
  23     Test ceph systemd services can start, stop and restart and
  24     check for any failed services and report back errors
  25     """
  26     for remote, roles in ctx.cluster.remotes.iteritems():
  27         remote.run(args=['sudo', 'ps', '-eaf', run.Raw('|'),
  28                          'grep', 'ceph'])
  29         r = remote.run(args=['sudo', 'systemctl', 'list-units', run.Raw('|'),
  30                              'grep', 'ceph'], stdout=StringIO(),
  31                        check_status=False)
  32         log.info(r.stdout.getvalue())
  33         if r.stdout.getvalue().find('failed'):
  34             log.info("Ceph services in failed state")
  35
  36         # test overall service stop and start using ceph.target
  37         # ceph.target tests are meant for ceph systemd tests
  38         # and not actual process testing using 'ps'
  39         log.info("Stopping all Ceph services")
  40         remote.run(args=['sudo', 'systemctl', 'stop', 'ceph.target'])
  41         r = remote.run(args=['sudo', 'systemctl', 'status', 'ceph.target'],
  42                        stdout=StringIO(), check_status=False)
  43         log.info(r.stdout.getvalue())
  44         log.info("Checking process status")
  45         r = remote.run(args=['sudo', 'ps', '-eaf', run.Raw('|'),
  46                              'grep', 'ceph'], stdout=StringIO())
  47         if r.stdout.getvalue().find('Active: inactive'):
  48             log.info("Sucessfully stopped all ceph services")
  49         else:
  50             log.info("Failed to stop ceph services")
  51
  52         log.info("Starting all Ceph services")
  53         remote.run(args=['sudo', 'systemctl', 'start', 'ceph.target'])
  54         r = remote.run(args=['sudo', 'systemctl', 'status', 'ceph.target'],
  55                        stdout=StringIO())
  56         log.info(r.stdout.getvalue())
  57         if r.stdout.getvalue().find('Active: active'):
  58             log.info("Sucessfully started all Ceph services")
  59         else:
  60             log.info("info", "Failed to start Ceph services")
  61         r = remote.run(args=['sudo', 'ps', '-eaf', run.Raw('|'),
  62                              'grep', 'ceph'], stdout=StringIO())
  63         log.info(r.stdout.getvalue())
  64         time.sleep(4)
  65
  66         # test individual services start stop
  67         name = remote.shortname
  68         mon_name = 'ceph-mon@' + name + '.service'
  69         mds_name = 'ceph-mds@' + name + '.service'
  70         mgr_name = 'ceph-mgr@' + name + '.service'
  71         mon_role_name = 'mon.' + name
  72         mds_role_name = 'mds.' + name
  73         mgr_role_name = 'mgr.' + name
  74         m_osd = re.search('--id (\d+) --setuser ceph', r.stdout.getvalue())
  75         if m_osd:
  76             osd_service = 'ceph-osd@{m}.service'.format(m=m_osd.group(1))
  77             remote.run(args=['sudo', 'systemctl', 'status',
  78                              osd_service])
  79             remote.run(args=['sudo', 'systemctl', 'stop',
  80                              osd_service])
  81             time.sleep(4)  # immediate check will result in deactivating state
  82             r = remote.run(args=['sudo', 'systemctl', 'status', osd_service],
  83                            stdout=StringIO(), check_status=False)
  84             log.info(r.stdout.getvalue())
  85             if r.stdout.getvalue().find('Active: inactive'):
  86                 log.info("Sucessfully stopped single osd ceph service")
  87             else:
  88                 log.info("Failed to stop ceph osd services")
  89             remote.run(args=['sudo', 'systemctl', 'start',
  90                              osd_service])
  91             time.sleep(4)
  92         if mon_role_name in roles:
  93             remote.run(args=['sudo', 'systemctl', 'status', mon_name])
  94             remote.run(args=['sudo', 'systemctl', 'stop', mon_name])
  95             time.sleep(4)  # immediate check will result in deactivating state
  96             r = remote.run(args=['sudo', 'systemctl', 'status', mon_name],
  97                            stdout=StringIO(), check_status=False)
  98             if r.stdout.getvalue().find('Active: inactive'):
  99                 log.info("Sucessfully stopped single mon ceph service")
 100             else:
 101                 log.info("Failed to stop ceph mon service")
 102             remote.run(args=['sudo', 'systemctl', 'start', mon_name])
 103             time.sleep(4)
 104         if mgr_role_name in roles:
 105             remote.run(args=['sudo', 'systemctl', 'status', mgr_name])
 106             remote.run(args=['sudo', 'systemctl', 'stop', mgr_name])
 107             time.sleep(4)  # immediate check will result in deactivating state
 108             r = remote.run(args=['sudo', 'systemctl', 'status', mgr_name],
 109                            stdout=StringIO(), check_status=False)
 110             if r.stdout.getvalue().find('Active: inactive'):
 111                 log.info("Sucessfully stopped single ceph mgr service")
 112             else:
 113                 log.info("Failed to stop ceph mgr service")
 114             remote.run(args=['sudo', 'systemctl', 'start', mgr_name])
 115             time.sleep(4)
 116         if mds_role_name in roles:
 117             remote.run(args=['sudo', 'systemctl', 'status', mds_name])
 118             remote.run(args=['sudo', 'systemctl', 'stop', mds_name])
 119             time.sleep(4)  # immediate check will result in deactivating state
 120             r = remote.run(args=['sudo', 'systemctl', 'status', mds_name],
 121                            stdout=StringIO(), check_status=False)
 122             if r.stdout.getvalue().find('Active: inactive'):
 123                 log.info("Sucessfully stopped single ceph mds service")
 124             else:
 125                 log.info("Failed to stop ceph mds service")
 126             remote.run(args=['sudo', 'systemctl', 'start', mds_name])
 127             time.sleep(4)
 128
 129     # reboot all nodes and verify the systemd units restart
 130     # workunit that runs would fail if any of the systemd unit doesnt start
 131     ctx.cluster.run(args='sudo reboot', wait=False, check_status=False)
 132     # avoid immediate reconnect
 133     time.sleep(120)
 134     reconnect(ctx, 480)  # reconnect all nodes
 135     # for debug info
 136     ctx.cluster.run(args=['sudo', 'ps', '-eaf', run.Raw('|'),
 137                           'grep', 'ceph'])
 138     # wait for HEALTH_OK
 139     mon = get_first_mon(ctx, config)
 140     (mon_remote,) = ctx.cluster.only(mon).remotes.iterkeys()
 141     wait_until_healthy(ctx, mon_remote, use_sudo=True)
 142     yield