4 Handle the setup, starting, and clean-up of a Ceph cluster.
6 from cStringIO import StringIO
18 from paramiko import SSHException
19 from ceph_manager import CephManager, write_conf
20 from tasks.cephfs.filesystem import Filesystem
21 from teuthology import misc as teuthology
22 from teuthology import contextutil
23 from teuthology import exceptions
24 from teuthology.orchestra import run
25 import ceph_client as cclient
26 from teuthology.orchestra.daemon import DaemonGroup
28 CEPH_ROLE_TYPES = ['mon', 'mgr', 'osd', 'mds', 'rgw']
30 log = logging.getLogger(__name__)
# Build the cephx capability arguments for a daemon of the given type.
# NOTE(review): this extract is missing most interior source lines (embedded
# line numbers jump 33 -> 35 -> 46 -> 63); the `defaults` table referenced
# below is largely absent. Only the mgr 'mon' capability line survives.
33 def generate_caps(type_):
35 Each call will return the next capability for each system type
36 (essentially a subset of possible role values). Valid types are osd,
# One surviving entry of the per-type capability table: mgr daemons get the
# 'allow profile mgr' mon capability.
46 mon='allow profile mgr',
# Iterates the chosen type's {subsystem: capability} pairs; presumably each
# pair is emitted as ceph-authtool/auth caps arguments -- TODO confirm
# against the unabridged source.
63 for subsystem, capability in defaults[type_].items():
# Context manager: create /var/log/ceph (world-writable) on all remotes,
# disable the stock ceph logrotate, optionally run a background log-rotation
# greenlet, and on teardown compress/archive logs under ctx.archive.
# NOTE(review): this extract is missing many interior source lines; the
# statements below are NOT contiguous (see the embedded line numbers).
69 @contextlib.contextmanager
70 def ceph_log(ctx, config):
72 Create /var/log/ceph log directory that is open to everyone.
73 Add valgrind and profiling-logger directories.
76 :param config: Configuration
78 log.info('Making ceph log dir writeable by non-root...')
# The distro-installed /etc/logrotate.d/ceph is neutralized so it cannot
# rotate logs out from under the test harness.
90 log.info('Disabling ceph logrotate...')
96 '/etc/logrotate.d/ceph',
101 log.info('Creating extra log directories...')
106 'install', '-d', '-m0777', '--',
107 '/var/log/ceph/valgrind',
108 '/var/log/ceph/profiling-logger',
# Nested helper class: drives periodic logrotate runs from a gevent
# greenlet until stop_event is set.
114 class Rotater(object):
115 stop_event = gevent.event.Event()
117 def invoke_logrotate(self):
118 # 1) install ceph-test.conf in /etc/logrotate.d
119 # 2) continuously loop over logrotate invocation with ceph-test.conf
120 while not self.stop_event.is_set():
# Waits up to 30s per iteration, so shutdown latency is bounded by this.
121 self.stop_event.wait(timeout=30)
125 args=['sudo', 'logrotate', '/etc/logrotate.d/ceph-test.conf'
# Transient connectivity failures are deliberately swallowed: nodes may be
# powered off / rebooted mid-test, and a missed rotation is harmless.
130 except exceptions.ConnectionLostError as e:
131 # Some tests may power off nodes during test, in which
132 # case we will see connection errors that we should ignore.
133 log.debug("Missed logrotate, node '{0}' is offline".format(
135 except EOFError as e:
136 # Paramiko sometimes raises this when it fails to
137 # connect to a node during open_session. As with
138 # ConnectionLostError, we ignore this because nodes
139 # are allowed to get power cycled during tests.
140 log.debug("Missed logrotate, EOFError")
141 except SSHException as e:
142 log.debug("Missed logrotate, SSHException")
143 except socket.error as e:
# Only EHOSTUNREACH is treated as benign here; presumably other errnos
# re-raise -- TODO confirm against the unabridged source.
144 if e.errno == errno.EHOSTUNREACH:
145 log.debug("Missed logrotate, host unreachable")
150 self.thread = gevent.spawn(self.invoke_logrotate)
153 self.stop_event.set()
# Nested helper: render the logrotate.conf template once per daemon type
# (with its max size) and install it on every remote as ceph-test.conf.
156 def write_rotate_conf(ctx, daemons):
157 testdir = teuthology.get_testdir(ctx)
158 rotate_conf_path = os.path.join(os.path.dirname(__file__), 'logrotate.conf')
# Python 2 builtin file(); the template is re-read/formatted per daemon.
159 with file(rotate_conf_path, 'rb') as f:
161 for daemon, size in daemons.iteritems():
162 log.info('writing logrotate stanza for {daemon}'.format(daemon=daemon))
163 conf += f.read().format(daemon_type=daemon, max_size=size)
166 for remote in ctx.cluster.remotes.iterkeys():
# Written to the testdir first, then (below) moved into /etc/logrotate.d
# under sudo.
167 teuthology.write_file(remote=remote,
168 path='{tdir}/logrotate.ceph-test.conf'.format(tdir=testdir),
175 '{tdir}/logrotate.ceph-test.conf'.format(tdir=testdir),
176 '/etc/logrotate.d/ceph-test.conf',
181 '/etc/logrotate.d/ceph-test.conf',
186 '/etc/logrotate.d/ceph-test.conf'
# SELinux label so logrotate will read the installed config.
189 remote.chcon('/etc/logrotate.d/ceph-test.conf',
190 'system_u:object_r:etc_t:s0')
# Rotation is opt-in via the top-level 'log-rotate' config key, whose value
# is the per-daemon size mapping fed to write_rotate_conf.
192 if ctx.config.get('log-rotate'):
193 daemons = ctx.config.get('log-rotate')
194 log.info('Setting up log rotation with ' + str(daemons))
195 write_rotate_conf(ctx, daemons)
196 logrotater = Rotater()
# --- teardown path (after yield, elided in this extract) ---
202 if ctx.config.get('log-rotate'):
203 log.info('Shutting down logrotate')
206 args=['sudo', 'rm', '/etc/logrotate.d/ceph-test.conf'
# Archive logs unless archive-on-error is set and the run succeeded.
209 if ctx.archive is not None and \
210 not (ctx.config.get('archive-on-error') and ctx.summary['success']):
212 log.info('Compressing logs...')
235 log.info('Archiving logs...')
236 path = os.path.join(ctx.archive, 'remote')
238 for remote in ctx.cluster.remotes.iterkeys():
239 sub = os.path.join(path, remote.shortname)
241 teuthology.pull_directory(remote, '/var/log/ceph',
242 os.path.join(sub, 'log'))
def assign_devs(roles, devs):
    """
    Create a dictionary of devs indexed by roles.

    Pairs each role with the device at the same position; if the lists
    differ in length, the extras in the longer list are ignored (zip
    truncates to the shorter input).

    :param roles: List of roles
    :param devs: Corresponding list of devices.
    :returns: Dictionary of devs indexed by roles.
    """
    return dict(zip(roles, devs))
# Context manager: after the wrapped tests run, grep every remote's
# /var/log/ceph/valgrind/* output and raise if valgrind problems were found
# (or if expected problems were NOT found).
# NOTE(review): interior source lines are missing; statements below are
# non-contiguous (see embedded line numbers).
256 @contextlib.contextmanager
257 def valgrind_post(ctx, config):
259 After the tests run, look throught all the valgrind logs. Exceptions are raised
260 if textual errors occured in the logs, or if valgrind exceptions were detected in
264 :param config: Configuration
269 lookup_procs = list()
270 log.info('Checking for errors in any valgrind logs...')
# Kick off one (presumably non-blocking) grep per remote, collect the
# procs, then harvest results in a second pass below.
271 for remote in ctx.cluster.remotes.iterkeys():
272 # look at valgrind logs for each node
278 run.Raw('/var/log/ceph/valgrind/*'),
279 '/dev/null', # include a second file so that we always get a filename prefix on the output
289 lookup_procs.append((proc, remote))
291 valgrind_exception = None
292 for (proc, remote) in lookup_procs:
294 out = proc.stdout.getvalue()
295 for line in out.split('\n'):
# Each grep hit is 'filename:kind'; lines that don't split cleanly are
# logged and (presumably) skipped.
299 (file, kind) = line.split(':')
301 log.error('failed to split line %s', line)
303 log.debug('file %s kind %s', file, kind)
# MDS memory leaks ("Lost" records) are tolerated here -- only logged, not
# turned into a failure; other issues set valgrind_exception.
304 if (file.find('mds') >= 0) and kind.find('Lost') > 0:
306 log.error('saw valgrind issue %s in %s', kind, file)
307 valgrind_exception = Exception('saw valgrind issues')
# Tests may declare they EXPECT valgrind errors; absence is then a failure.
309 if config.get('expect_valgrind_errors'):
310 if not valgrind_exception:
311 raise Exception('expected valgrind issues and found none')
313 if valgrind_exception:
314 raise valgrind_exception
# Context manager: apply the configured CRUSH tunables profile (default
# 'default') via the first mon of the cluster.
# NOTE(review): interior lines (the mon_remote.run call wrapper and the
# yield) are missing from this extract.
317 @contextlib.contextmanager
318 def crush_setup(ctx, config):
319 cluster_name = config['cluster']
320 first_mon = teuthology.get_first_mon(ctx, config, cluster_name)
# Exactly one remote is expected to hold the first mon role.
321 (mon_remote,) = ctx.cluster.only(first_mon).remotes.iterkeys()
323 profile = config.get('crush_tunables', 'default')
324 log.info('Setting crush tunables to %s', profile)
326 args=['sudo', 'ceph', '--cluster', cluster_name,
327 'osd', 'crush', 'tunables', profile])
# Context manager: wait for all OSDs to be up, then (unless disabled via
# 'create_rbd_pool: false') create the 'rbd' pool with 8 PGs and tag it
# with the rbd application.
# NOTE(review): interior lines missing; statements below are non-contiguous.
331 @contextlib.contextmanager
332 def create_rbd_pool(ctx, config):
333 cluster_name = config['cluster']
334 first_mon = teuthology.get_first_mon(ctx, config, cluster_name)
335 (mon_remote,) = ctx.cluster.only(first_mon).remotes.iterkeys()
336 log.info('Waiting for OSDs to come up')
337 teuthology.wait_until_osds_up(
341 ceph_cluster=cluster_name,
# Pool creation defaults to on; tests can opt out.
343 if config.get('create_rbd_pool', True):
344 log.info('Creating RBD pool')
346 args=['sudo', 'ceph', '--cluster', cluster_name,
347 'osd', 'pool', 'create', 'rbd', '8'])
350 'sudo', 'ceph', '--cluster', cluster_name,
351 'osd', 'pool', 'application', 'enable',
352 'rbd', 'rbd', '--yes-i-really-mean-it'
# Context manager: if any MDS roles exist, create a CephFS filesystem named
# 'cephfs' and size max_mds to the number of non-standby MDS roles.
# NOTE(review): interior lines missing; statements below are non-contiguous.
357 @contextlib.contextmanager
358 def cephfs_setup(ctx, config):
359 cluster_name = config['cluster']
360 testdir = teuthology.get_testdir(ctx)
361 coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
363 first_mon = teuthology.get_first_mon(ctx, config, cluster_name)
364 (mon_remote,) = ctx.cluster.only(first_mon).remotes.iterkeys()
365 mdss = ctx.cluster.only(teuthology.is_type('mds', cluster_name))
366 # If there are any MDSs, then create a filesystem for them to use
367 # Do this last because requires mon cluster to be up and running
369 log.info('Setting up CephFS filesystem...')
# Optional EC data pool via 'cephfs_ec_profile'.
371 fs = Filesystem(ctx, name='cephfs', create=True,
372 ec_profile=config.get('cephfs_ec_profile', None))
# Standby MDS roles are named with an '-s' suffix / '-s-' infix; everything
# else counts toward max_mds.
374 is_active_mds = lambda role: 'mds.' in role and not role.endswith('-s') and '-s-' not in role
375 all_roles = [item for remote_roles in mdss.remotes.values() for item in remote_roles]
376 num_active = len([r for r in all_roles if is_active_mds(r)])
378 fs.set_max_mds(num_active)
379 fs.set_allow_dirfrags(True)
# Context manager: full lifecycle of a test Ceph cluster -- directories,
# config generation, keyrings, monmap distribution, mgr/mds/osd/mon mkfs --
# then, on teardown, log scanning, unmounting, archiving and cleanup.
# NOTE(review): this extract is missing a large number of interior source
# lines (see embedded numbering); the statements below are NOT contiguous
# and must not be edited without the unabridged source.
384 @contextlib.contextmanager
385 def cluster(ctx, config):
387 Handle the creation and removal of a ceph cluster.
390 Create directories needed for the cluster.
391 Create remote journals for all osds.
392 Create and set keyring.
393 Copy the monmap to tht test systems.
397 Add keyring information to monmaps
401 If errors occured, extract a failure message and store in ctx.summary.
402 Unmount all test files and temporary journaling files.
403 Save the monitor information and archive all ceph logs.
404 Cleanup the keyring setup, and remove all monitor map and data files left over.
407 :param config: Configuration
# Short-circuit: reuse a pre-existing cluster when requested.
409 if ctx.config.get('use_existing_cluster', False) is True:
410 log.info("'use_existing_cluster' is true; skipping cluster creation")
413 testdir = teuthology.get_testdir(ctx)
414 cluster_name = config['cluster']
415 data_dir = '{tdir}/{cluster}.data'.format(tdir=testdir, cluster=cluster_name)
416 log.info('Creating ceph cluster %s...', cluster_name)
420 'install', '-d', '-m0755', '--',
431 'install', '-d', '-m0777', '--', '/var/run/ceph',
# --- scratch-device / journal discovery per OSD host ---
438 remote_to_roles_to_devs = {}
439 remote_to_roles_to_journals = {}
440 osds = ctx.cluster.only(teuthology.is_type('osd', cluster_name))
441 for remote, roles_for_host in osds.remotes.iteritems():
442 devs = teuthology.get_scratch_devices(remote)
444 roles_to_journals = {}
446 log.info('fs option selected, checking for scratch devs')
447 log.info('found devs: %s' % (str(devs),))
# Devices are keyed by stable WWN ids so role->device mapping survives
# renumbering across reboots.
448 devs_id_map = teuthology.get_wwn_id_map(remote, devs)
449 iddevs = devs_id_map.values()
450 roles_to_devs = assign_devs(
451 teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name), iddevs
# Leftover devices (more devs than osd roles) are presumably used for
# journals below -- note assign_devs truncates to the shorter list.
453 if len(roles_to_devs) < len(iddevs):
454 iddevs = iddevs[len(roles_to_devs):]
455 devs_to_clean[remote] = []
457 if config.get('block_journal'):
458 log.info('block journal enabled')
459 roles_to_journals = assign_devs(
460 teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name), iddevs
462 log.info('journal map: %s', roles_to_journals)
# tmpfs journals: one 1500M sparse file per osd role under a tmpfs /mnt.
464 if config.get('tmpfs_journal'):
465 log.info('tmpfs journal enabled')
466 roles_to_journals = {}
467 remote.run(args=['sudo', 'mount', '-t', 'tmpfs', 'tmpfs', '/mnt'])
468 for role in teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name):
469 tmpfs = '/mnt/' + role
470 roles_to_journals[role] = tmpfs
471 remote.run(args=['truncate', '-s', '1500M', tmpfs])
472 log.info('journal map: %s', roles_to_journals)
474 log.info('dev map: %s' % (str(roles_to_devs),))
475 remote_to_roles_to_devs[remote] = roles_to_devs
476 remote_to_roles_to_journals[remote] = roles_to_journals
# --- ceph.conf generation ---
478 log.info('Generating config...')
479 remotes_and_roles = ctx.cluster.remotes.items()
480 roles = [role_list for (remote, role_list) in remotes_and_roles]
# Mon addresses come from the live SSH transport peer address.
481 ips = [host for (host, port) in
482 (remote.ssh.get_transport().getpeername() for (remote, role_list) in remotes_and_roles)]
483 conf = teuthology.skeleton_config(ctx, roles=roles, ips=ips, cluster=cluster_name)
484 for remote, roles_to_journals in remote_to_roles_to_journals.iteritems():
485 for role, journal in roles_to_journals.iteritems():
486 name = teuthology.ceph_role(role)
489 conf[name]['osd journal'] = journal
# Task-config 'conf' overrides are merged section-by-section.
490 for section, keys in config['conf'].iteritems():
491 for key, value in keys.iteritems():
492 log.info("[%s] %s = %s" % (section, key, value))
493 if section not in conf:
495 conf[section][key] = value
# tmpfs does not support O_DIRECT, hence journal dio off.
497 if config.get('tmpfs_journal'):
498 conf['journal dio'] = False
500 if not hasattr(ctx, 'ceph'):
502 ctx.ceph[cluster_name] = argparse.Namespace()
503 ctx.ceph[cluster_name].conf = conf
505 default_keyring = '/etc/ceph/{cluster}.keyring'.format(cluster=cluster_name)
506 keyring_path = config.get('keyring_path', default_keyring)
508 coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
# --- first mon: keyring + monmap creation ---
510 firstmon = teuthology.get_first_mon(ctx, config, cluster_name)
512 log.info('Setting up %s...' % firstmon)
513 ctx.cluster.only(firstmon).run(
524 ctx.cluster.only(firstmon).run(
536 ctx.cluster.only(firstmon).run(
544 (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
545 monmap_path = '{tdir}/{cluster}.monmap'.format(tdir=testdir,
546 cluster=cluster_name)
547 fsid = teuthology.create_simple_monmap(
553 if not 'global' in conf:
555 conf['global']['fsid'] = fsid
557 default_conf_path = '/etc/ceph/{cluster}.conf'.format(cluster=cluster_name)
558 conf_path = config.get('conf_path', default_conf_path)
559 log.info('Writing %s for FSID %s...' % (conf_path, fsid))
560 write_conf(ctx, conf_path, cluster_name)
# client.admin gets allow-* caps for every daemon type.
562 log.info('Creating admin key on %s...' % firstmon)
563 ctx.cluster.only(firstmon).run(
571 '--name=client.admin',
573 '--cap', 'mon', 'allow *',
574 '--cap', 'osd', 'allow *',
575 '--cap', 'mds', 'allow *',
576 '--cap', 'mgr', 'allow *',
# --- distribute keyring + monmap to every node ---
581 log.info('Copying monmap to all nodes...')
582 keyring = teuthology.get_file(
586 monmap = teuthology.get_file(
591 for rem in ctx.cluster.remotes.iterkeys():
592 # copy mon key and initial monmap
593 log.info('Sending monmap to node {remote}'.format(remote=rem))
594 teuthology.sudo_write_file(
600 teuthology.write_file(
606 log.info('Setting up mon nodes...')
607 mons = ctx.cluster.only(teuthology.is_type('mon', cluster_name))
# --- mgr keyrings (skippable for upgrade tests) ---
609 if not config.get('skip_mgr_daemons', False):
610 log.info('Setting up mgr nodes...')
611 mgrs = ctx.cluster.only(teuthology.is_type('mgr', cluster_name))
612 for remote, roles_for_host in mgrs.remotes.iteritems():
613 for role in teuthology.cluster_roles_of_type(roles_for_host, 'mgr',
615 _, _, id_ = teuthology.split_role(role)
616 mgr_dir = '/var/lib/ceph/mgr/{cluster}-{id}'.format(
617 cluster=cluster_name,
634 '--name=mgr.{id}'.format(id=id_),
635 mgr_dir + '/keyring',
# --- mds keyrings ---
639 log.info('Setting up mds nodes...')
640 mdss = ctx.cluster.only(teuthology.is_type('mds', cluster_name))
641 for remote, roles_for_host in mdss.remotes.iteritems():
642 for role in teuthology.cluster_roles_of_type(roles_for_host, 'mds',
644 _, _, id_ = teuthology.split_role(role)
645 mds_dir = '/var/lib/ceph/mds/{cluster}-{id}'.format(
646 cluster=cluster_name,
663 '--name=mds.{id}'.format(id=id_),
664 mds_dir + '/keyring',
668 cclient.create_keyring(ctx, cluster_name)
669 log.info('Running mkfs on osd nodes...')
# ctx.disk_config namespaces persist device/journal/mount metadata for
# later tasks; created lazily and deep-merged so multiple clusters coexist.
671 if not hasattr(ctx, 'disk_config'):
672 ctx.disk_config = argparse.Namespace()
673 if not hasattr(ctx.disk_config, 'remote_to_roles_to_dev'):
674 ctx.disk_config.remote_to_roles_to_dev = {}
675 if not hasattr(ctx.disk_config, 'remote_to_roles_to_journals'):
676 ctx.disk_config.remote_to_roles_to_journals = {}
677 if not hasattr(ctx.disk_config, 'remote_to_roles_to_dev_mount_options'):
678 ctx.disk_config.remote_to_roles_to_dev_mount_options = {}
679 if not hasattr(ctx.disk_config, 'remote_to_roles_to_dev_fstype'):
680 ctx.disk_config.remote_to_roles_to_dev_fstype = {}
682 teuthology.deep_merge(ctx.disk_config.remote_to_roles_to_dev, remote_to_roles_to_devs)
683 teuthology.deep_merge(ctx.disk_config.remote_to_roles_to_journals, remote_to_roles_to_journals)
685 log.info("ctx.disk_config.remote_to_roles_to_dev: {r}".format(r=str(ctx.disk_config.remote_to_roles_to_dev)))
# --- per-osd mkfs + mount of scratch devices ---
686 for remote, roles_for_host in osds.remotes.iteritems():
687 roles_to_devs = remote_to_roles_to_devs[remote]
688 roles_to_journals = remote_to_roles_to_journals[remote]
690 for role in teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name):
691 _, _, id_ = teuthology.split_role(role)
692 mnt_point = '/var/lib/ceph/osd/{cluster}-{id}'.format(cluster=cluster_name, id=id_)
700 log.info(str(roles_to_devs))
701 log.info(str(roles_to_journals))
703 if roles_to_devs.get(role):
704 dev = roles_to_devs[role]
705 fs = config.get('fs')
707 mkfs_options = config.get('mkfs_options')
708 mount_options = config.get('mount_options')
# Per-fs defaults (btrfs / xfs / ext branches; the branch conditions for
# btrfs and xfs are among the missing lines).
710 # package = 'btrfs-tools'
711 if mount_options is None:
712 mount_options = ['noatime', 'user_subvol_rm_allowed']
713 if mkfs_options is None:
714 mkfs_options = ['-m', 'single',
718 # package = 'xfsprogs'
719 if mount_options is None:
720 mount_options = ['noatime']
721 if mkfs_options is None:
722 mkfs_options = ['-f', '-i', 'size=2048']
723 if fs == 'ext4' or fs == 'ext3':
724 if mount_options is None:
725 mount_options = ['noatime', 'user_xattr']
727 if mount_options is None:
729 if mkfs_options is None:
731 mkfs = ['mkfs.%s' % fs] + mkfs_options
732 log.info('%s on %s on %s' % (mkfs, dev, remote))
733 if package is not None:
737 'apt-get', 'install', '-y', package
# 'yes |' answers any interactive overwrite prompt from mkfs.
743 remote.run(args=['yes', run.Raw('|')] + ['sudo'] + mkfs + [dev])
744 except run.CommandFailedError:
745 # Newer btfs-tools doesn't prompt for overwrite, use -f
# NOTE(review): checks '-f' in mount_options but appends to mkfs_options;
# looks suspicious (probably should check mkfs_options) -- verify upstream.
746 if '-f' not in mount_options:
747 mkfs_options.append('-f')
748 mkfs = ['mkfs.%s' % fs] + mkfs_options
749 log.info('%s on %s on %s' % (mkfs, dev, remote))
750 remote.run(args=['yes', run.Raw('|')] + ['sudo'] + mkfs + [dev])
752 log.info('mount %s on %s -o %s' % (dev, remote,
753 ','.join(mount_options)))
759 '-o', ','.join(mount_options),
766 'sudo', '/sbin/restorecon', mnt_point,
770 if not remote in ctx.disk_config.remote_to_roles_to_dev_mount_options:
771 ctx.disk_config.remote_to_roles_to_dev_mount_options[remote] = {}
772 ctx.disk_config.remote_to_roles_to_dev_mount_options[remote][role] = mount_options
773 if not remote in ctx.disk_config.remote_to_roles_to_dev_fstype:
774 ctx.disk_config.remote_to_roles_to_dev_fstype[remote] = {}
775 ctx.disk_config.remote_to_roles_to_dev_fstype[remote][role] = fs
# Remember mounts so the teardown path (951ff) can unmount them.
776 devs_to_clean[remote].append(mnt_point)
778 for role in teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name):
779 _, _, id_ = teuthology.split_role(role)
793 '--monmap', monmap_path,
# --- collect all daemon keyrings, feed them to the mons ---
797 log.info('Reading keys from all nodes...')
800 for remote, roles_for_host in ctx.cluster.remotes.iteritems():
801 for type_ in ['mgr', 'mds', 'osd']:
802 if type_ == 'mgr' and config.get('skip_mgr_daemons', False):
804 for role in teuthology.cluster_roles_of_type(roles_for_host, type_, cluster_name):
805 _, _, id_ = teuthology.split_role(role)
806 data = teuthology.get_file(
808 path='/var/lib/ceph/{type}/{cluster}-{id}/keyring'.format(
811 cluster=cluster_name,
815 keys.append((type_, id_, data))
817 for remote, roles_for_host in ctx.cluster.remotes.iteritems():
818 for role in teuthology.cluster_roles_of_type(roles_for_host, 'client', cluster_name):
819 _, _, id_ = teuthology.split_role(role)
820 data = teuthology.get_file(
822 path='/etc/ceph/{cluster}.client.{id}.keyring'.format(id=id_, cluster=cluster_name)
824 keys.append(('client', id_, data))
827 log.info('Adding keys to all mons...')
838 teuthology.feed_many_stdins_and_close(keys_fp, writes)
# Register each key with the caps table from generate_caps().
840 for type_, id_, data in keys:
850 '--name={type}.{id}'.format(
854 ] + list(generate_caps(type_)),
# --- mon mkfs ---
859 log.info('Running mkfs on mon nodes...')
860 for remote, roles_for_host in mons.remotes.iteritems():
861 for role in teuthology.cluster_roles_of_type(roles_for_host, 'mon', cluster_name):
862 _, _, id_ = teuthology.split_role(role)
868 '/var/lib/ceph/mon/{cluster}-{id}'.format(id=id_, cluster=cluster_name),
878 '--cluster', cluster_name,
881 '--monmap', monmap_path,
882 '--keyring', keyring_path,
# --- teardown (after yield; yield itself is among the missing lines) ---
900 # we need to know this below
901 ctx.summary['success'] = False
904 (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
906 log.info('Checking cluster log for badness...')
# Nested helper: shell pipeline (egrep | egrep -v ... | head -1) over the
# cluster log; returns the first matching line or None.
908 def first_in_ceph_log(pattern, excludes):
910 Find the first occurence of the pattern specified in the Ceph log,
911 Returns None if none found.
913 :param pattern: Pattern scanned for.
914 :param excludes: Patterns to ignore.
915 :return: First line of text (or None if not found)
920 '/var/log/ceph/{cluster}.log'.format(cluster=cluster_name),
922 for exclude in excludes:
923 args.extend([run.Raw('|'), 'egrep', '-v', exclude])
925 run.Raw('|'), 'head', '-n', '1',
931 stdout = r.stdout.getvalue()
# Any non-whitelisted ERR/WRN/SEC line fails the run; the most severe
# match becomes the failure reason (SEC > ERR > WRN).
936 if first_in_ceph_log('\[ERR\]|\[WRN\]|\[SEC\]',
937 config['log_whitelist']) is not None:
938 log.warning('Found errors (ERR|WRN|SEC) in cluster log')
939 ctx.summary['success'] = False
940 # use the most severe problem as the failure reason
941 if 'failure_reason' not in ctx.summary:
942 for pattern in ['\[SEC\]', '\[ERR\]', '\[WRN\]']:
943 match = first_in_ceph_log(pattern, config['log_whitelist'])
944 if match is not None:
945 ctx.summary['failure_reason'] = \
946 '"{match}" in cluster log'.format(
947 match=match.rstrip('\n'),
# Unmount every scratch mount recorded in devs_to_clean.
951 for remote, dirs in devs_to_clean.iteritems():
953 log.info('Unmounting %s on %s' % (dir_, remote))
965 except Exception as e:
968 run.Raw('PATH=/usr/sbin:$PATH'),
975 if config.get('tmpfs_journal'):
976 log.info('tmpfs journal enabled - unmounting tmpfs at /mnt')
977 for remote, roles_for_host in osds.remotes.iteritems():
979 args=['sudo', 'umount', '-f', '/mnt'],
# Archive mon data unless archive-on-error && success.
983 if ctx.archive is not None and \
984 not (ctx.config.get('archive-on-error') and ctx.summary['success']):
986 # archive mon data, too
987 log.info('Archiving mon data...')
988 path = os.path.join(ctx.archive, 'data')
# EEXIST on the archive dir is tolerated (already created).
992 if e.errno == errno.EEXIST:
996 for remote, roles in mons.remotes.iteritems():
998 is_mon = teuthology.is_type('mon', cluster_name)
1000 _, _, id_ = teuthology.split_role(role)
1001 mon_dir = '/var/lib/ceph/mon/' + \
1002 '{0}-{1}'.format(cluster_name, id_)
1003 teuthology.pull_directory_tarball(
1006 path + '/' + role + '.tgz')
1008 log.info('Cleaning ceph cluster...')
1020 run.Raw('{tdir}/../*.pid'.format(tdir=testdir)),
# Deep-scrub all PGs at teardown: wait for active+clean, issue a deep-scrub
# per OSD, then poll last_scrub_stamp until every PG shows a post-start
# scrub time, re-requesting and eventually timing out if no progress.
# NOTE(review): interior lines missing; statements below are non-contiguous.
1027 def osd_scrub_pgs(ctx, config):
1029 Scrub pgs when we exit.
1031 First make sure all pgs are active and clean.
1032 Next scrub all osds.
1033 Then periodically check until all pgs have scrub time stamps that
1034 indicate the last scrub completed. Time out if no progess is made
1035 here after two minutes.
1039 cluster_name = config['cluster']
1040 manager = ctx.managers[cluster_name]
# Phase 1: all PGs must report active+clean before scrubbing starts.
1042 for _ in range(0, retries):
1043 stats = manager.get_pg_stats()
1044 bad = [stat['pgid'] for stat in stats if 'active+clean' not in stat['state']]
1049 "Waiting for all PGs to be active and clean, waiting on %s" % bad)
1052 raise RuntimeError("Scrubbing terminated -- not all pgs were active and clean.")
# Timestamps after this moment count as "scrubbed since we started".
1053 check_time_now = time.localtime()
# Phase 2: one deep-scrub request per OSD role.
1055 all_roles = teuthology.all_roles(ctx.cluster)
1056 for role in teuthology.cluster_roles_of_type(all_roles, 'osd', cluster_name):
1057 log.info("Scrubbing {osd}".format(osd=role))
1058 _, _, id_ = teuthology.split_role(role)
1059 # allow this to fail; in certain cases the OSD might not be up
1060 # at this point. we will catch all pgs below.
1062 manager.raw_cluster_cmd('osd', 'deep-scrub', id_)
1063 except run.CommandFailedError:
# Phase 3: poll per-PG last_scrub_stamp until all are newer than
# check_time_now; progress resets the gap counter.
1069 stats = manager.get_pg_stats()
1070 timez = [(stat['pgid'],stat['last_scrub_stamp']) for stat in stats]
1073 for (pgid, tmval) in timez:
# Stamp looks like 'YYYY-mm-dd HH:MM:SS.frac'; fraction stripped before
# parsing. Comparing struct_time tuples gives chronological order.
1074 pgtm = time.strptime(tmval[0:tmval.find('.')], '%Y-%m-%d %H:%M:%S')
1075 if pgtm > check_time_now:
1078 log.info('pgid %s last_scrub_stamp %s %s <= %s', pgid, tmval, pgtm, check_time_now)
1080 if thiscnt > prev_good:
# Every 6th stale poll, re-issue per-PG deep-scrubs in case the earlier
# request was dropped.
1085 if gap_cnt % 6 == 0:
1086 for (pgid, tmval) in timez:
1087 # re-request scrub every so often in case the earlier
1088 # request was missed. do not do it everytime because
1089 # the scrub may be in progress or not reported yet and
1090 # we will starve progress.
1091 manager.raw_cluster_cmd('pg', 'deep-scrub', pgid)
1092 if gap_cnt > retries:
1093 raise RuntimeError('Exiting scrub checking -- not all pgs scrubbed.')
1095 log.info('Still waiting for all pgs to be scrubbed.')
# Context manager: start every daemon of one role type (mon/mgr/osd/mds/...)
# in the cluster, wiring in coverage, cpu_profile and valgrind wrappers; on
# cleanup stop all daemons of that type.
# NOTE(review): interior lines missing; statements below are non-contiguous.
1099 @contextlib.contextmanager
1100 def run_daemon(ctx, config, type_):
1102 Run daemons for a role type. Handle the startup and termination of a a daemon.
1103 On startup -- set coverages, cpu_profile, valgrind values for all remotes,
1104 and a max_mds value for one mds.
1105 On cleanup -- Stop all existing daemons of this type.
1108 :param config: Configuration
1109 :paran type_: Role type
1111 cluster_name = config['cluster']
1112 log.info('Starting %s daemons in cluster %s...', type_, cluster_name)
1113 testdir = teuthology.get_testdir(ctx)
1114 daemons = ctx.cluster.only(teuthology.is_type(type_, cluster_name))
1116 # check whether any daemons if this type are configured
1119 coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
# Under coverage/valgrind the daemon must exit cleanly, so stop with
# SIGTERM instead of SIGKILL.
1121 daemon_signal = 'kill'
1122 if config.get('coverage') or config.get('valgrind') is not None:
1123 daemon_signal = 'term'
1125 # create osds in order. (this only matters for pre-luminous, which might
1126 # be hammer, which doesn't take an id_ argument to legacy 'osd create').
# First pass: read each OSD's fsid file so ids can be registered in order.
1128 for remote, roles_for_host in daemons.remotes.iteritems():
1129 is_type_ = teuthology.is_type(type_, cluster_name)
1130 for role in roles_for_host:
1131 if not is_type_(role):
1133 _, _, id_ = teuthology.split_role(role)
1137 datadir='/var/lib/ceph/osd/{cluster}-{id}'.format(
1138 cluster=cluster_name, id=id_)
1139 osd_uuid = teuthology.get_file(
1141 path=datadir + '/fsid',
1144 osd_uuids[id_] = osd_uuid
1145 for osd_id in range(len(osd_uuids)):
# NOTE(review): indexes osd_uuids with `id_` (loop leftover) rather than
# `osd_id`; the conversion to the loop variable happens in missing lines --
# verify against the unabridged source before changing.
1147 osd_uuid = osd_uuids.get(id_)
# Luminous+ 'osd new' takes an explicit id; on failure fall back to the
# pre-luminous 'osd create <uuid>' form.
1151 'sudo', 'ceph', '--cluster', cluster_name,
1152 'osd', 'new', osd_uuid, id_,
1156 # fallback to pre-luminous (hammer or jewel)
1159 'sudo', 'ceph', '--cluster', cluster_name,
1160 'osd', 'create', osd_uuid,
1163 if config.get('add_osds_to_crush'):
1166 'sudo', 'ceph', '--cluster', cluster_name,
1167 'osd', 'crush', 'create-or-move', 'osd.' + id_,
1168 '1.0', 'host=localhost', 'root=default',
# Second pass: build the run command per daemon and register/start it.
1172 for remote, roles_for_host in daemons.remotes.iteritems():
1173 is_type_ = teuthology.is_type(type_, cluster_name)
1174 for role in roles_for_host:
1175 if not is_type_(role):
1177 _, _, id_ = teuthology.split_role(role)
1188 'ceph-%s' % (type_),
1190 '--cluster', cluster_name,
# Optional gperftools CPU profiling for this daemon type.
1193 if type_ in config.get('cpu_profile', []):
1194 profile_path = '/var/log/ceph/profiling-logger/%s.prof' % (role)
1195 run_cmd.extend(['env', 'CPUPROFILE=%s' % profile_path])
# Valgrind args may be keyed by type ('osd') or exact role ('osd.1');
# the role-specific entry wins.
1197 if config.get('valgrind') is not None:
1198 valgrind_args = None
1199 if type_ in config['valgrind']:
1200 valgrind_args = config['valgrind'][type_]
1201 if role in config['valgrind']:
1202 valgrind_args = config['valgrind'][role]
1203 run_cmd = teuthology.get_valgrind_args(testdir, role,
1207 run_cmd.extend(run_cmd_tail)
1209 # always register mgr; don't necessarily start
1210 ctx.daemons.register_daemon(
1212 cluster=cluster_name,
1214 logger=log.getChild(role),
# mgr daemons can be registered without being started (upgrade tests).
1218 if type_ != 'mgr' or not config.get('skip_mgr_daemons', False):
1219 role = cluster_name + '.' + type_
1220 ctx.daemons.get_daemon(type_, id_, cluster_name).restart()
1225 teuthology.stop_daemons_of_type(ctx, type_, cluster_name)
# Block until the named cluster is fully up: mgr available, OSDs up, PGs
# clean, health OK, and (if MDSs exist) filesystem daemons healthy.
# NOTE(review): interior lines missing; statements below are non-contiguous.
1228 def healthy(ctx, config):
1230 Wait for all osd's to be up, and for the ceph health monitor to return HEALTH_OK.
1233 :param config: Configuration
# Tolerates non-dict config by falling back to defaults.
1235 config = config if isinstance(config, dict) else dict()
1236 cluster_name = config.get('cluster', 'ceph')
1237 log.info('Waiting until %s daemons up and pgs clean...', cluster_name)
1238 manager = ctx.managers[cluster_name]
# mgr-wait failures are tolerated: mixed-version (upgrade) clusters may not
# have a mgr yet.
1240 manager.wait_for_mgr_available(timeout=30)
1241 except (run.CommandFailedError, AssertionError) as e:
1242 log.info('ignoring mgr wait error, probably testing upgrade: %s', e)
1244 firstmon = teuthology.get_first_mon(ctx, config, cluster_name)
1245 (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
1246 teuthology.wait_until_osds_up(
1248 cluster=ctx.cluster,
1250 ceph_cluster=cluster_name,
# Same best-effort tolerance for flush_all_pg_stats during upgrades.
1254 manager.flush_all_pg_stats()
1255 except (run.CommandFailedError, Exception) as e:
1256 log.info('ignoring flush pg stats error, probably testing upgrade: %s', e)
1257 manager.wait_for_clean()
1259 log.info('Waiting until ceph cluster %s is healthy...', cluster_name)
1260 teuthology.wait_until_healthy(
1263 ceph_cluster=cluster_name,
1266 if ctx.cluster.only(teuthology.is_type('mds', cluster_name)).remotes:
1267 # Some MDSs exist, wait for them to be healthy
1268 ceph_fs = Filesystem(ctx) # TODO: make Filesystem cluster-aware
1269 ceph_fs.wait_for_daemons(timeout=300)
# Block until every OSD in the (optionally named) cluster reports up,
# delegating to teuthology.wait_until_osds_up via the first mon's remote.
# NOTE(review): the call's remaining keyword arguments are among the
# missing lines of this extract.
1272 def wait_for_osds_up(ctx, config):
1274 Wait for all osd's to come up.
1277 :param config: Configuration
1279 log.info('Waiting until ceph osds are all up...')
1280 cluster_name = config.get('cluster', 'ceph')
1281 firstmon = teuthology.get_first_mon(ctx, config, cluster_name)
1282 (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
1283 teuthology.wait_until_osds_up(
1285 cluster=ctx.cluster,
# Poll quorum_status on the first mon until the quorum member set equals
# the requested monitor list. config is either a dict
# ({daemons: [...], cluster: name}) or a bare list of mon names.
# NOTE(review): interior lines missing; statements below are non-contiguous.
1290 def wait_for_mon_quorum(ctx, config):
1292 Check renote ceph status until all monitors are up.
1295 :param config: Configuration
1297 if isinstance(config, dict):
1298 mons = config['daemons']
1299 cluster_name = config.get('cluster', 'ceph')
# List form implies the default 'ceph' cluster.
1301 assert isinstance(config, list)
1303 cluster_name = 'ceph'
1304 firstmon = teuthology.get_first_mon(ctx, config, cluster_name)
1305 (remote,) = ctx.cluster.only(firstmon).remotes.keys()
# Up to 60 tries, 10s apart (~10 minutes) before safe_while gives up.
1306 with contextutil.safe_while(sleep=10, tries=60,
1307 action='wait for monitor quorum') as proceed:
1316 logger=log.getChild('quorum_status'),
1318 j = json.loads(r.stdout.getvalue())
1319 q = j.get('quorum_names', [])
1320 log.debug('Quorum: %s', q)
# Order-insensitive comparison of quorum membership.
1321 if sorted(q) == sorted(mons):
# Register pools created outside ceph-manager so its pool bookkeeping stays
# accurate. NOTE(review): the arguments to get_pool_property (and the tail
# of this function) are among the missing lines -- presumably the pg_num
# property; confirm against the unabridged source.
1325 def created_pool(ctx, config):
1327 Add new pools to the dictionary of pools that the ceph-manager
1330 for new_pool in config:
# Only pools not already tracked by the default-cluster manager are added.
1331 if new_pool not in ctx.managers['ceph'].pools:
1332 ctx.managers['ceph'].pools[new_pool] = ctx.managers['ceph'].get_pool_property(
# Context manager task: restart the listed daemons (or all), optionally
# waiting for health and/or OSDs-up per cluster afterward.
# NOTE(review): interior lines missing; statements below are non-contiguous.
1336 @contextlib.contextmanager
1337 def restart(ctx, config):
1339 restart ceph daemons
1343 - ceph.restart: [all]
1347 - ceph.restart: [osd.0, mon.1, mds.*]
1353 daemons: [osd.0, mon.1]
1354 wait-for-healthy: false
1355 wait-for-osds-up: true
1358 :param config: Configuration
# A bare list is normalized to {'daemons': [...]}.
1362 elif isinstance(config, list):
1363 config = {'daemons': config}
# Wildcard roles like 'mds.*' are expanded against CEPH_ROLE_TYPES.
1365 daemons = ctx.daemons.resolve_role_list(config.get('daemons', None), CEPH_ROLE_TYPES, True)
1367 for role in daemons:
1368 cluster, type_, id_ = teuthology.split_role(role)
1369 ctx.daemons.get_daemon(type_, id_, cluster).restart()
1370 clusters.add(cluster)
# Restarted OSDs are proactively marked down so peering starts promptly.
# NOTE(review): uses the default-cluster manager regardless of role cluster.
1372 manager = ctx.managers['ceph']
1373 for dmon in daemons:
1375 dm_parts = dmon.split('.')
1376 if dm_parts[1].isdigit():
1377 if dm_parts[0] == 'osd':
1378 manager.mark_down_osd(int(dm_parts[1]))
# wait-for-healthy defaults on; wait-for-osds-up defaults off.
1380 if config.get('wait-for-healthy', True):
1381 for cluster in clusters:
1382 healthy(ctx=ctx, config=dict(cluster=cluster))
1383 if config.get('wait-for-osds-up', False):
1384 for cluster in clusters:
1385 wait_for_osds_up(ctx=ctx, config=dict(cluster=cluster))
# Context manager task: stop the listed daemons (wildcards allowed), e.g.
# `ceph.stop: [mds.*]`.
# NOTE(review): interior lines missing; statements below are non-contiguous.
1389 @contextlib.contextmanager
1390 def stop(ctx, config):
1396 - ceph.stop: [mds.*]
1399 - ceph.stop: [osd.0, osd.2]
1403 daemons: [osd.0, osd.2]
# Same list-to-dict normalization as restart().
1408 elif isinstance(config, list):
1409 config = {'daemons': config}
1411 daemons = ctx.daemons.resolve_role_list(config.get('daemons', None), CEPH_ROLE_TYPES, True)
1412 for role in daemons:
1413 cluster, type_, id_ = teuthology.split_role(role)
1414 ctx.daemons.get_daemon(type_, id_, cluster).stop()
# Context manager task: wait for each listed daemon to exit; a daemon that
# fails is the EXPECTED outcome, one that stays up raises RuntimeError.
# NOTE(review): interior lines missing; statements below are non-contiguous.
1419 @contextlib.contextmanager
1420 def wait_for_failure(ctx, config):
1422 Wait for a failure of a ceph daemon
1426 - ceph.wait_for_failure: [mds.*]
1429 - ceph.wait_for_failure: [osd.0, osd.2]
1432 - ceph.wait_for_failure:
1433 daemons: [osd.0, osd.2]
# Same list-to-dict normalization as restart()/stop().
1438 elif isinstance(config, list):
1439 config = {'daemons': config}
1441 daemons = ctx.daemons.resolve_role_list(config.get('daemons', None), CEPH_ROLE_TYPES, True)
1442 for role in daemons:
1443 cluster, type_, id_ = teuthology.split_role(role)
# .wait() raising indicates the daemon died -- the expected path here.
1445 ctx.daemons.get_daemon(type_, id_, cluster).wait()
1447 log.info('Saw expected daemon failure. Continuing.')
1450 raise RuntimeError('daemon %s did not fail' % role)
# Sanity-check the task configuration: OSD roles from different clusters
# must not share a host (their scratch devices would collide).
# NOTE(review): interior lines missing (including last_cluster's
# initialization and the osd-role `continue`).
1455 def validate_config(ctx, config):
1457 Perform some simple validation on task configuration.
1458 Raises exceptions.ConfigError if an error is found.
1460 # check for osds from multiple clusters on the same host
1461 for remote, roles_for_host in ctx.cluster.remotes.items():
1464 for role in roles_for_host:
1465 role_cluster, role_type, _ = teuthology.split_role(role)
# Non-osd roles are allowed to be colocated across clusters.
1466 if role_type != 'osd':
1468 if last_cluster and last_cluster != role_cluster:
1469 msg = "Host should not have osds (%s and %s) from multiple clusters" % (
1471 raise exceptions.ConfigError(msg)
1472 last_cluster = role_cluster
1476 @contextlib.contextmanager
1477 def task(ctx, config):
1479 Set up and tear down a Ceph cluster.
1487 You can also specify what branch to run::
1503 sha1: 1376a5ab0c89780eab39ffbbe436f6a6092314ed
1505 Or a local source dir::
1509 path: /home/sage/ceph
1511 To capture code coverage data, use::
1517 To use btrfs, ext4, or xfs on the target's scratch disks, use::
1522 mkfs_options: [-b,size=65536,-l,logdev=/dev/sdc1]
1523 mount_options: [nobarrier, inode64]
1525 Note, this will cause the task to check the /scratch_devs file on each node
1526 for available devices. If no such file is found, /dev/sdb will be used.
1528 To run some daemons under valgrind, include their names
1529 and the tool/args to use in a valgrind section::
1534 mds.1: --tool=memcheck
1535 osd.1: [--tool=memcheck, --leak-check=no]
1537 Those nodes which are using memcheck or valgrind will get
1538 checked for bad results.
1540 To adjust or modify config options, use::
1555 other key: other value
1560 By default, the cluster log is checked for errors and warnings,
1561 and the run marked failed if any appear. You can ignore log
1562 entries by giving a list of egrep compatible regexes, i.e.:
1566 log-whitelist: ['foo.*bar', 'bad message']
1568 To run multiple ceph clusters, use multiple ceph tasks, and roles
1569 with a cluster name prefix, e.g. cluster1.client.0. Roles with no
1570 cluster use the default cluster name, 'ceph'. OSDs from separate
1571 clusters must be on separate hosts. Clients and non-osd daemons
1572 from multiple clusters may be colocated. For each cluster, add an
1573 instance of the ceph task with the cluster name specified, e.g.::
1576 - [mon.a, osd.0, osd.1]
1577 - [backup.mon.a, backup.osd.0, backup.osd.1]
1578 - [client.0, backup.client.0]
1586 :param config: Configuration
1591 assert isinstance(config, dict), \
1592 "task ceph only supports a dictionary for configuration"
1594 overrides = ctx.config.get('overrides', {})
1595 teuthology.deep_merge(config, overrides.get('ceph', {}))
1597 first_ceph_cluster = False
1598 if not hasattr(ctx, 'daemons'):
1599 first_ceph_cluster = True
1600 ctx.daemons = DaemonGroup()
1602 testdir = teuthology.get_testdir(ctx)
1603 if config.get('coverage'):
1604 coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
1605 log.info('Creating coverage directory...')
1609 'install', '-d', '-m0755', '--',
1616 if 'cluster' not in config:
1617 config['cluster'] = 'ceph'
1619 validate_config(ctx, config)
1622 if first_ceph_cluster:
1623 # these tasks handle general log setup and parsing on all hosts,
1624 # so they should only be run once
1626 lambda: ceph_log(ctx=ctx, config=None),
1627 lambda: valgrind_post(ctx=ctx, config=config),
1631 lambda: cluster(ctx=ctx, config=dict(
1632 conf=config.get('conf', {}),
1633 fs=config.get('fs', 'xfs'),
1634 mkfs_options=config.get('mkfs_options', None),
1635 mount_options=config.get('mount_options', None),
1636 block_journal=config.get('block_journal', None),
1637 tmpfs_journal=config.get('tmpfs_journal', None),
1638 skip_mgr_daemons=config.get('skip_mgr_daemons', False),
1639 log_whitelist=config.get('log-whitelist', []),
1640 cpu_profile=set(config.get('cpu_profile', []),),
1641 cluster=config['cluster'],
1643 lambda: run_daemon(ctx=ctx, config=config, type_='mon'),
1644 lambda: run_daemon(ctx=ctx, config=config, type_='mgr'),
1645 lambda: crush_setup(ctx=ctx, config=config),
1646 lambda: run_daemon(ctx=ctx, config=config, type_='osd'),
1647 lambda: create_rbd_pool(ctx=ctx, config=config),
1648 lambda: cephfs_setup(ctx=ctx, config=config),
1649 lambda: run_daemon(ctx=ctx, config=config, type_='mds'),
1652 with contextutil.nested(*subtasks):
1653 first_mon = teuthology.get_first_mon(ctx, config, config['cluster'])
1654 (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
1655 if not hasattr(ctx, 'managers'):
1657 ctx.managers[config['cluster']] = CephManager(
1660 logger=log.getChild('ceph_manager.' + config['cluster']),
1661 cluster=config['cluster'],
1665 if config.get('wait-for-healthy', True):
1666 healthy(ctx=ctx, config=dict(cluster=config['cluster']))
1670 if config.get('wait-for-scrub', True):
1671 osd_scrub_pgs(ctx, config)
1673 # stop logging health to clog during shutdown, or else we generate
1674 # a bunch of scary messages unrelated to our actual run.
1675 firstmon = teuthology.get_first_mon(ctx, config, config['cluster'])
1676 (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
1681 '--cluster', config['cluster'],
1686 '--no-mon-health-to-clog',