Fix some bugs when testing opensds ansible
[stor4nfv.git] src/ceph/qa/tasks/ceph.py
1 """
2 Ceph cluster task.
3
4 Handle the setup, starting, and clean-up of a Ceph cluster.
5 """
6 from cStringIO import StringIO
7
8 import argparse
9 import contextlib
10 import errno
11 import logging
12 import os
13 import json
14 import time
15 import gevent
16 import socket
17
18 from paramiko import SSHException
19 from ceph_manager import CephManager, write_conf
20 from tasks.cephfs.filesystem import Filesystem
21 from teuthology import misc as teuthology
22 from teuthology import contextutil
23 from teuthology import exceptions
24 from teuthology.orchestra import run
25 import ceph_client as cclient
26 from teuthology.orchestra.daemon import DaemonGroup
27
28 CEPH_ROLE_TYPES = ['mon', 'mgr', 'osd', 'mds', 'rgw']
29
30 log = logging.getLogger(__name__)
31
32
33 def generate_caps(type_):
34     """
35     Yield the capability arguments for the given system type
36     (essentially a subset of possible role values).  Valid types are osd,
37     mgr, mds and client.
38     """
39     defaults = dict(
40         osd=dict(
41             mon='allow *',
42             mgr='allow *',
43             osd='allow *',
44         ),
45         mgr=dict(
46             mon='allow profile mgr',
47             osd='allow *',
48             mds='allow *',
49         ),
50         mds=dict(
51             mon='allow *',
52             mgr='allow *',
53             osd='allow *',
54             mds='allow',
55         ),
56         client=dict(
57             mon='allow rw',
58             mgr='allow r',
59             osd='allow rwx',
60             mds='allow',
61         ),
62     )
63     for subsystem, capability in defaults[type_].items():
64         yield '--cap'
65         yield subsystem
66         yield capability
67
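# Illustrative sketch (comment only): for a given type_ the generator above
# yields ('--cap', subsystem, capability) triples that get appended to the
# ceph-authtool command line later in this file, e.g. generate_caps('mgr')
# produces '--cap mon allow profile mgr', '--cap osd allow *' and
# '--cap mds allow *' (triple order follows the defaults dict and is not
# guaranteed under Python 2).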
68
69 @contextlib.contextmanager
70 def ceph_log(ctx, config):
71     """
72     Create /var/log/ceph log directory that is open to everyone.
73     Add valgrind and profiling-logger directories.
74
75     :param ctx: Context
76     :param config: Configuration
77     """
78     log.info('Making ceph log dir writeable by non-root...')
79     run.wait(
80         ctx.cluster.run(
81             args=[
82                 'sudo',
83                 'chmod',
84                 '777',
85                 '/var/log/ceph',
86             ],
87             wait=False,
88         )
89     )
90     log.info('Disabling ceph logrotate...')
91     run.wait(
92         ctx.cluster.run(
93             args=[
94                 'sudo',
95                 'rm', '-f', '--',
96                 '/etc/logrotate.d/ceph',
97             ],
98             wait=False,
99         )
100     )
101     log.info('Creating extra log directories...')
102     run.wait(
103         ctx.cluster.run(
104             args=[
105                 'sudo',
106                 'install', '-d', '-m0777', '--',
107                 '/var/log/ceph/valgrind',
108                 '/var/log/ceph/profiling-logger',
109             ],
110             wait=False,
111         )
112     )
113
114     class Rotater(object):
115         stop_event = gevent.event.Event()
116
117         def invoke_logrotate(self):
118             # 1) install ceph-test.conf in /etc/logrotate.d
119             # 2) continuously loop over logrotate invocation with ceph-test.conf
120             while not self.stop_event.is_set():
121                 self.stop_event.wait(timeout=30)
122                 try:
123                     run.wait(
124                         ctx.cluster.run(
125                             args=['sudo', 'logrotate', '/etc/logrotate.d/ceph-test.conf'
126                                   ],
127                             wait=False,
128                         )
129                     )
130                 except exceptions.ConnectionLostError as e:
131                     # Some tests may power off nodes during test, in which
132                     # case we will see connection errors that we should ignore.
133                     log.debug("Missed logrotate, node '{0}' is offline".format(
134                         e.node))
135                 except EOFError as e:
136                     # Paramiko sometimes raises this when it fails to
137                     # connect to a node during open_session.  As with
138                     # ConnectionLostError, we ignore this because nodes
139                     # are allowed to get power cycled during tests.
140                     log.debug("Missed logrotate, EOFError")
141                 except SSHException as e:
142                     log.debug("Missed logrotate, SSHException")
143                 except socket.error as e:
144                     if e.errno == errno.EHOSTUNREACH:
145                         log.debug("Missed logrotate, host unreachable")
146                     else:
147                         raise
148
149         def begin(self):
150             self.thread = gevent.spawn(self.invoke_logrotate)
151
152         def end(self):
153             self.stop_event.set()
154             self.thread.get()
155
156     def write_rotate_conf(ctx, daemons):
157         testdir = teuthology.get_testdir(ctx)
158         rotate_conf_path = os.path.join(os.path.dirname(__file__), 'logrotate.conf')
159         with open(rotate_conf_path, 'rb') as f:
160             conf = ""
161             for daemon, size in daemons.iteritems():
162                 log.info('writing logrotate stanza for {daemon}'.format(daemon=daemon))
163                 conf += f.read().format(daemon_type=daemon, max_size=size)
164                 f.seek(0, 0)
165
166             for remote in ctx.cluster.remotes.iterkeys():
167                 teuthology.write_file(remote=remote,
168                                       path='{tdir}/logrotate.ceph-test.conf'.format(tdir=testdir),
169                                       data=StringIO(conf)
170                                       )
171                 remote.run(
172                     args=[
173                         'sudo',
174                         'mv',
175                         '{tdir}/logrotate.ceph-test.conf'.format(tdir=testdir),
176                         '/etc/logrotate.d/ceph-test.conf',
177                         run.Raw('&&'),
178                         'sudo',
179                         'chmod',
180                         '0644',
181                         '/etc/logrotate.d/ceph-test.conf',
182                         run.Raw('&&'),
183                         'sudo',
184                         'chown',
185                         'root.root',
186                         '/etc/logrotate.d/ceph-test.conf'
187                     ]
188                 )
189                 remote.chcon('/etc/logrotate.d/ceph-test.conf',
190                              'system_u:object_r:etc_t:s0')
191
192     if ctx.config.get('log-rotate'):
193         daemons = ctx.config.get('log-rotate')
194         log.info('Setting up log rotation with ' + str(daemons))
195         write_rotate_conf(ctx, daemons)
196         logrotater = Rotater()
197         logrotater.begin()
198     try:
199         yield
200
201     finally:
202         if ctx.config.get('log-rotate'):
203             log.info('Shutting down logrotate')
204             logrotater.end()
205             ctx.cluster.run(
206                 args=['sudo', 'rm', '/etc/logrotate.d/ceph-test.conf'
207                       ]
208             )
209         if ctx.archive is not None and \
210                 not (ctx.config.get('archive-on-error') and ctx.summary['success']):
211             # compress and archive the ceph logs
212             log.info('Compressing logs...')
213             run.wait(
214                 ctx.cluster.run(
215                     args=[
216                         'sudo',
217                         'find',
218                         '/var/log/ceph',
219                         '-name',
220                         '*.log',
221                         '-print0',
222                         run.Raw('|'),
223                         'sudo',
224                         'xargs',
225                         '-0',
226                         '--no-run-if-empty',
227                         '--',
228                         'gzip',
229                         '--',
230                     ],
231                     wait=False,
232                 ),
233             )
234
235             log.info('Archiving logs...')
236             path = os.path.join(ctx.archive, 'remote')
237             os.makedirs(path)
238             for remote in ctx.cluster.remotes.iterkeys():
239                 sub = os.path.join(path, remote.shortname)
240                 os.makedirs(sub)
241                 teuthology.pull_directory(remote, '/var/log/ceph',
242                                           os.path.join(sub, 'log'))
243
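# A hedged sketch of the ``log-rotate`` entry that ceph_log() looks up in the
# top-level job config (ctx.config); the daemon names and sizes below are
# assumptions, but the shape -- a mapping of daemon type to the max_size
# substituted into logrotate.conf -- is what write_rotate_conf() expects:
#
#   log-rotate:
#     ceph-osd: 10G
#     ceph-mds: 10G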
244
245 def assign_devs(roles, devs):
246     """
247     Create a dictionary of devs indexed by roles
248
249     :param roles: List of roles
250     :param devs: Corresponding list of devices.
251     :returns: Dictionary of devs indexed by roles.
252     """
253     return dict(zip(roles, devs))
254
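# For example, assign_devs(['osd.0', 'osd.1'], ['/dev/sdb', '/dev/sdc'])
# returns {'osd.0': '/dev/sdb', 'osd.1': '/dev/sdc'}; any surplus devices are
# silently dropped by zip().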
255
256 @contextlib.contextmanager
257 def valgrind_post(ctx, config):
258     """
259     After the tests run, look through all the valgrind logs.  Exceptions are raised
260     if textual errors occurred in the logs, or if valgrind issues were detected in
261     the logs.
262
263     :param ctx: Context
264     :param config: Configuration
265     """
266     try:
267         yield
268     finally:
269         lookup_procs = list()
270         log.info('Checking for errors in any valgrind logs...')
271         for remote in ctx.cluster.remotes.iterkeys():
272             # look at valgrind logs for each node
273             proc = remote.run(
274                 args=[
275                     'sudo',
276                     'zgrep',
277                     '<kind>',
278                     run.Raw('/var/log/ceph/valgrind/*'),
279                     '/dev/null',  # include a second file so that we always get a filename prefix on the output
280                     run.Raw('|'),
281                     'sort',
282                     run.Raw('|'),
283                     'uniq',
284                 ],
285                 wait=False,
286                 check_status=False,
287                 stdout=StringIO(),
288             )
289             lookup_procs.append((proc, remote))
290
291         valgrind_exception = None
292         for (proc, remote) in lookup_procs:
293             proc.wait()
294             out = proc.stdout.getvalue()
295             for line in out.split('\n'):
296                 if line == '':
297                     continue
298                 try:
299                     (file, kind) = line.split(':')
300                 except Exception:
301                     log.error('failed to split line %s', line)
302                     raise
303                 log.debug('file %s kind %s', file, kind)
304                 if (file.find('mds') >= 0) and kind.find('Lost') > 0:
305                     continue
306                 log.error('saw valgrind issue %s in %s', kind, file)
307                 valgrind_exception = Exception('saw valgrind issues')
308
309         if config.get('expect_valgrind_errors'):
310             if not valgrind_exception:
311                 raise Exception('expected valgrind issues and found none')
312         else:
313             if valgrind_exception:
314                 raise valgrind_exception
315
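# A hedged sketch of the option checked above for suites that deliberately
# provoke valgrind failures; only the key name comes from the code, its
# placement under the ceph task config is an assumption:
#
#   tasks:
#   - ceph:
#       expect_valgrind_errors: true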
316
317 @contextlib.contextmanager
318 def crush_setup(ctx, config):
319     cluster_name = config['cluster']
320     first_mon = teuthology.get_first_mon(ctx, config, cluster_name)
321     (mon_remote,) = ctx.cluster.only(first_mon).remotes.iterkeys()
322
323     profile = config.get('crush_tunables', 'default')
324     log.info('Setting crush tunables to %s', profile)
325     mon_remote.run(
326         args=['sudo', 'ceph', '--cluster', cluster_name,
327               'osd', 'crush', 'tunables', profile])
328     yield
329
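# A minimal sketch of selecting the tunables profile applied above; the value
# is passed straight to ``ceph osd crush tunables`` (e.g. bobtail, firefly,
# hammer, jewel, optimal, or the 'default' used when unset):
#
#   tasks:
#   - ceph:
#       crush_tunables: optimal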
330
331 @contextlib.contextmanager
332 def create_rbd_pool(ctx, config):
333     cluster_name = config['cluster']
334     first_mon = teuthology.get_first_mon(ctx, config, cluster_name)
335     (mon_remote,) = ctx.cluster.only(first_mon).remotes.iterkeys()
336     log.info('Waiting for OSDs to come up')
337     teuthology.wait_until_osds_up(
338         ctx,
339         cluster=ctx.cluster,
340         remote=mon_remote,
341         ceph_cluster=cluster_name,
342     )
343     if config.get('create_rbd_pool', True):
344         log.info('Creating RBD pool')
345         mon_remote.run(
346             args=['sudo', 'ceph', '--cluster', cluster_name,
347                   'osd', 'pool', 'create', 'rbd', '8'])
348         mon_remote.run(
349             args=[
350                 'sudo', 'ceph', '--cluster', cluster_name,
351                 'osd', 'pool', 'application', 'enable',
352                 'rbd', 'rbd', '--yes-i-really-mean-it'
353             ],
354             check_status=False)
355     yield
356
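# The default 'rbd' pool creation above can be skipped with a config like the
# following sketch (only the create_rbd_pool key is taken from the code):
#
#   tasks:
#   - ceph:
#       create_rbd_pool: false
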
357 @contextlib.contextmanager
358 def cephfs_setup(ctx, config):
359     cluster_name = config['cluster']
360     testdir = teuthology.get_testdir(ctx)
361     coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
362
363     first_mon = teuthology.get_first_mon(ctx, config, cluster_name)
364     (mon_remote,) = ctx.cluster.only(first_mon).remotes.iterkeys()
365     mdss = ctx.cluster.only(teuthology.is_type('mds', cluster_name))
366     # If there are any MDSs, then create a filesystem for them to use
367     # Do this last because requires mon cluster to be up and running
368     if mdss.remotes:
369         log.info('Setting up CephFS filesystem...')
370
371         fs = Filesystem(ctx, name='cephfs', create=True,
372                         ec_profile=config.get('cephfs_ec_profile', None))
373
374         is_active_mds = lambda role: 'mds.' in role and not role.endswith('-s') and '-s-' not in role
375         all_roles = [item for remote_roles in mdss.remotes.values() for item in remote_roles]
376         num_active = len([r for r in all_roles if is_active_mds(r)])
377
378         fs.set_max_mds(num_active)
379         fs.set_allow_dirfrags(True)
380
381     yield
382
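# A hedged example of the optional erasure-code profile consumed above; the
# k/m values are assumptions, and the list is handed as-is to
# Filesystem(..., ec_profile=...):
#
#   tasks:
#   - ceph:
#       cephfs_ec_profile:
#         - m=2
#         - k=2
#         - crush-failure-domain=osd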
383
384 @contextlib.contextmanager
385 def cluster(ctx, config):
386     """
387     Handle the creation and removal of a ceph cluster.
388
389     On startup:
390         Create directories needed for the cluster.
391         Create remote journals for all osds.
392         Create and set keyring.
393         Copy the monmap to the test systems.
394         Setup mon nodes.
395         Setup mds nodes.
396         Mkfs osd nodes.
397         Add keyring information to monmaps
398         Mkfs mon nodes.
399
400     On exit:
401         If errors occurred, extract a failure message and store it in ctx.summary.
402         Unmount all test files and temporary journaling files.
403         Save the monitor information and archive all ceph logs.
404         Clean up the keyring setup, and remove all leftover monitor map and data files.
405
406     :param ctx: Context
407     :param config: Configuration
408     """
409     if ctx.config.get('use_existing_cluster', False) is True:
410         log.info("'use_existing_cluster' is true; skipping cluster creation")
411         yield
412
413     testdir = teuthology.get_testdir(ctx)
414     cluster_name = config['cluster']
415     data_dir = '{tdir}/{cluster}.data'.format(tdir=testdir, cluster=cluster_name)
416     log.info('Creating ceph cluster %s...', cluster_name)
417     run.wait(
418         ctx.cluster.run(
419             args=[
420                 'install', '-d', '-m0755', '--',
421                 data_dir,
422             ],
423             wait=False,
424         )
425     )
426
427     run.wait(
428         ctx.cluster.run(
429             args=[
430                 'sudo',
431                 'install', '-d', '-m0777', '--', '/var/run/ceph',
432             ],
433             wait=False,
434         )
435     )
436
437     devs_to_clean = {}
438     remote_to_roles_to_devs = {}
439     remote_to_roles_to_journals = {}
440     osds = ctx.cluster.only(teuthology.is_type('osd', cluster_name))
441     for remote, roles_for_host in osds.remotes.iteritems():
442         devs = teuthology.get_scratch_devices(remote)
443         roles_to_devs = {}
444         roles_to_journals = {}
445         if config.get('fs'):
446             log.info('fs option selected, checking for scratch devs')
447             log.info('found devs: %s' % (str(devs),))
448             devs_id_map = teuthology.get_wwn_id_map(remote, devs)
449             iddevs = devs_id_map.values()
450             roles_to_devs = assign_devs(
451                 teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name), iddevs
452             )
453             if len(roles_to_devs) < len(iddevs):
454                 iddevs = iddevs[len(roles_to_devs):]
455             devs_to_clean[remote] = []
456
457         if config.get('block_journal'):
458             log.info('block journal enabled')
459             roles_to_journals = assign_devs(
460                 teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name), iddevs
461             )
462             log.info('journal map: %s', roles_to_journals)
463
464         if config.get('tmpfs_journal'):
465             log.info('tmpfs journal enabled')
466             roles_to_journals = {}
467             remote.run(args=['sudo', 'mount', '-t', 'tmpfs', 'tmpfs', '/mnt'])
468             for role in teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name):
469                 tmpfs = '/mnt/' + role
470                 roles_to_journals[role] = tmpfs
471                 remote.run(args=['truncate', '-s', '1500M', tmpfs])
472             log.info('journal map: %s', roles_to_journals)
473
474         log.info('dev map: %s' % (str(roles_to_devs),))
475         remote_to_roles_to_devs[remote] = roles_to_devs
476         remote_to_roles_to_journals[remote] = roles_to_journals
477
478     log.info('Generating config...')
479     remotes_and_roles = ctx.cluster.remotes.items()
480     roles = [role_list for (remote, role_list) in remotes_and_roles]
481     ips = [host for (host, port) in
482            (remote.ssh.get_transport().getpeername() for (remote, role_list) in remotes_and_roles)]
483     conf = teuthology.skeleton_config(ctx, roles=roles, ips=ips, cluster=cluster_name)
484     for remote, roles_to_journals in remote_to_roles_to_journals.iteritems():
485         for role, journal in roles_to_journals.iteritems():
486             name = teuthology.ceph_role(role)
487             if name not in conf:
488                 conf[name] = {}
489             conf[name]['osd journal'] = journal
490     for section, keys in config['conf'].iteritems():
491         for key, value in keys.iteritems():
492             log.info("[%s] %s = %s" % (section, key, value))
493             if section not in conf:
494                 conf[section] = {}
495             conf[section][key] = value
496
497     if config.get('tmpfs_journal'):
498         conf['journal dio'] = False
499
500     if not hasattr(ctx, 'ceph'):
501         ctx.ceph = {}
502     ctx.ceph[cluster_name] = argparse.Namespace()
503     ctx.ceph[cluster_name].conf = conf
504
505     default_keyring = '/etc/ceph/{cluster}.keyring'.format(cluster=cluster_name)
506     keyring_path = config.get('keyring_path', default_keyring)
507
508     coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
509
510     firstmon = teuthology.get_first_mon(ctx, config, cluster_name)
511
512     log.info('Setting up %s...' % firstmon)
513     ctx.cluster.only(firstmon).run(
514         args=[
515             'sudo',
516             'adjust-ulimits',
517             'ceph-coverage',
518             coverage_dir,
519             'ceph-authtool',
520             '--create-keyring',
521             keyring_path,
522         ],
523     )
524     ctx.cluster.only(firstmon).run(
525         args=[
526             'sudo',
527             'adjust-ulimits',
528             'ceph-coverage',
529             coverage_dir,
530             'ceph-authtool',
531             '--gen-key',
532             '--name=mon.',
533             keyring_path,
534         ],
535     )
536     ctx.cluster.only(firstmon).run(
537         args=[
538             'sudo',
539             'chmod',
540             '0644',
541             keyring_path,
542         ],
543     )
544     (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
545     monmap_path = '{tdir}/{cluster}.monmap'.format(tdir=testdir,
546                                                    cluster=cluster_name)
547     fsid = teuthology.create_simple_monmap(
548         ctx,
549         remote=mon0_remote,
550         conf=conf,
551         path=monmap_path,
552     )
553     if not 'global' in conf:
554         conf['global'] = {}
555     conf['global']['fsid'] = fsid
556
557     default_conf_path = '/etc/ceph/{cluster}.conf'.format(cluster=cluster_name)
558     conf_path = config.get('conf_path', default_conf_path)
559     log.info('Writing %s for FSID %s...' % (conf_path, fsid))
560     write_conf(ctx, conf_path, cluster_name)
561
562     log.info('Creating admin key on %s...' % firstmon)
563     ctx.cluster.only(firstmon).run(
564         args=[
565             'sudo',
566             'adjust-ulimits',
567             'ceph-coverage',
568             coverage_dir,
569             'ceph-authtool',
570             '--gen-key',
571             '--name=client.admin',
572             '--set-uid=0',
573             '--cap', 'mon', 'allow *',
574             '--cap', 'osd', 'allow *',
575             '--cap', 'mds', 'allow *',
576             '--cap', 'mgr', 'allow *',
577             keyring_path,
578         ],
579     )
580
581     log.info('Copying monmap to all nodes...')
582     keyring = teuthology.get_file(
583         remote=mon0_remote,
584         path=keyring_path,
585     )
586     monmap = teuthology.get_file(
587         remote=mon0_remote,
588         path=monmap_path,
589     )
590
591     for rem in ctx.cluster.remotes.iterkeys():
592         # copy mon key and initial monmap
593         log.info('Sending monmap to node {remote}'.format(remote=rem))
594         teuthology.sudo_write_file(
595             remote=rem,
596             path=keyring_path,
597             data=keyring,
598             perms='0644'
599         )
600         teuthology.write_file(
601             remote=rem,
602             path=monmap_path,
603             data=monmap,
604         )
605
606     log.info('Setting up mon nodes...')
607     mons = ctx.cluster.only(teuthology.is_type('mon', cluster_name))
608
609     if not config.get('skip_mgr_daemons', False):
610         log.info('Setting up mgr nodes...')
611         mgrs = ctx.cluster.only(teuthology.is_type('mgr', cluster_name))
612         for remote, roles_for_host in mgrs.remotes.iteritems():
613             for role in teuthology.cluster_roles_of_type(roles_for_host, 'mgr',
614                                                          cluster_name):
615                 _, _, id_ = teuthology.split_role(role)
616                 mgr_dir = '/var/lib/ceph/mgr/{cluster}-{id}'.format(
617                     cluster=cluster_name,
618                     id=id_,
619                 )
620                 remote.run(
621                     args=[
622                         'sudo',
623                         'mkdir',
624                         '-p',
625                         mgr_dir,
626                         run.Raw('&&'),
627                         'sudo',
628                         'adjust-ulimits',
629                         'ceph-coverage',
630                         coverage_dir,
631                         'ceph-authtool',
632                         '--create-keyring',
633                         '--gen-key',
634                         '--name=mgr.{id}'.format(id=id_),
635                         mgr_dir + '/keyring',
636                     ],
637                 )
638
639     log.info('Setting up mds nodes...')
640     mdss = ctx.cluster.only(teuthology.is_type('mds', cluster_name))
641     for remote, roles_for_host in mdss.remotes.iteritems():
642         for role in teuthology.cluster_roles_of_type(roles_for_host, 'mds',
643                                                      cluster_name):
644             _, _, id_ = teuthology.split_role(role)
645             mds_dir = '/var/lib/ceph/mds/{cluster}-{id}'.format(
646                 cluster=cluster_name,
647                 id=id_,
648             )
649             remote.run(
650                 args=[
651                     'sudo',
652                     'mkdir',
653                     '-p',
654                     mds_dir,
655                     run.Raw('&&'),
656                     'sudo',
657                     'adjust-ulimits',
658                     'ceph-coverage',
659                     coverage_dir,
660                     'ceph-authtool',
661                     '--create-keyring',
662                     '--gen-key',
663                     '--name=mds.{id}'.format(id=id_),
664                     mds_dir + '/keyring',
665                 ],
666             )
667
668     cclient.create_keyring(ctx, cluster_name)
669     log.info('Running mkfs on osd nodes...')
670
671     if not hasattr(ctx, 'disk_config'):
672         ctx.disk_config = argparse.Namespace()
673     if not hasattr(ctx.disk_config, 'remote_to_roles_to_dev'):
674         ctx.disk_config.remote_to_roles_to_dev = {}
675     if not hasattr(ctx.disk_config, 'remote_to_roles_to_journals'):
676         ctx.disk_config.remote_to_roles_to_journals = {}
677     if not hasattr(ctx.disk_config, 'remote_to_roles_to_dev_mount_options'):
678         ctx.disk_config.remote_to_roles_to_dev_mount_options = {}
679     if not hasattr(ctx.disk_config, 'remote_to_roles_to_dev_fstype'):
680         ctx.disk_config.remote_to_roles_to_dev_fstype = {}
681
682     teuthology.deep_merge(ctx.disk_config.remote_to_roles_to_dev, remote_to_roles_to_devs)
683     teuthology.deep_merge(ctx.disk_config.remote_to_roles_to_journals, remote_to_roles_to_journals)
684
685     log.info("ctx.disk_config.remote_to_roles_to_dev: {r}".format(r=str(ctx.disk_config.remote_to_roles_to_dev)))
686     for remote, roles_for_host in osds.remotes.iteritems():
687         roles_to_devs = remote_to_roles_to_devs[remote]
688         roles_to_journals = remote_to_roles_to_journals[remote]
689
690         for role in teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name):
691             _, _, id_ = teuthology.split_role(role)
692             mnt_point = '/var/lib/ceph/osd/{cluster}-{id}'.format(cluster=cluster_name, id=id_)
693             remote.run(
694                 args=[
695                     'sudo',
696                     'mkdir',
697                     '-p',
698                     mnt_point,
699                 ])
700             log.info(str(roles_to_devs))
701             log.info(str(roles_to_journals))
702             log.info(role)
703             if roles_to_devs.get(role):
704                 dev = roles_to_devs[role]
705                 fs = config.get('fs')
706                 package = None
707                 mkfs_options = config.get('mkfs_options')
708                 mount_options = config.get('mount_options')
709                 if fs == 'btrfs':
710                     # package = 'btrfs-tools'
711                     if mount_options is None:
712                         mount_options = ['noatime', 'user_subvol_rm_allowed']
713                     if mkfs_options is None:
714                         mkfs_options = ['-m', 'single',
715                                         '-l', '32768',
716                                         '-n', '32768']
717                 if fs == 'xfs':
718                     # package = 'xfsprogs'
719                     if mount_options is None:
720                         mount_options = ['noatime']
721                     if mkfs_options is None:
722                         mkfs_options = ['-f', '-i', 'size=2048']
723                 if fs == 'ext4' or fs == 'ext3':
724                     if mount_options is None:
725                         mount_options = ['noatime', 'user_xattr']
726
727                 if mount_options is None:
728                     mount_options = []
729                 if mkfs_options is None:
730                     mkfs_options = []
731                 mkfs = ['mkfs.%s' % fs] + mkfs_options
732                 log.info('%s on %s on %s' % (mkfs, dev, remote))
733                 if package is not None:
734                     remote.run(
735                         args=[
736                             'sudo',
737                             'apt-get', 'install', '-y', package
738                         ],
739                         stdout=StringIO(),
740                     )
741
742                 try:
743                     remote.run(args=['yes', run.Raw('|')] + ['sudo'] + mkfs + [dev])
744                 except run.CommandFailedError:
745                     # Newer btrfs-tools doesn't prompt for overwrite; retry with -f
746                     if '-f' not in mkfs_options:
747                         mkfs_options.append('-f')
748                         mkfs = ['mkfs.%s' % fs] + mkfs_options
749                         log.info('%s on %s on %s' % (mkfs, dev, remote))
750                     remote.run(args=['yes', run.Raw('|')] + ['sudo'] + mkfs + [dev])
751
752                 log.info('mount %s on %s -o %s' % (dev, remote,
753                                                    ','.join(mount_options)))
754                 remote.run(
755                     args=[
756                         'sudo',
757                         'mount',
758                         '-t', fs,
759                         '-o', ','.join(mount_options),
760                         dev,
761                         mnt_point,
762                     ]
763                 )
764                 remote.run(
765                     args=[
766                         'sudo', '/sbin/restorecon', mnt_point,
767                     ],
768                     check_status=False,
769                 )
770                 if not remote in ctx.disk_config.remote_to_roles_to_dev_mount_options:
771                     ctx.disk_config.remote_to_roles_to_dev_mount_options[remote] = {}
772                 ctx.disk_config.remote_to_roles_to_dev_mount_options[remote][role] = mount_options
773                 if not remote in ctx.disk_config.remote_to_roles_to_dev_fstype:
774                     ctx.disk_config.remote_to_roles_to_dev_fstype[remote] = {}
775                 ctx.disk_config.remote_to_roles_to_dev_fstype[remote][role] = fs
776                 devs_to_clean[remote].append(mnt_point)
777
778         for role in teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name):
779             _, _, id_ = teuthology.split_role(role)
780             remote.run(
781                 args=[
782                     'sudo',
783                     'MALLOC_CHECK_=3',
784                     'adjust-ulimits',
785                     'ceph-coverage',
786                     coverage_dir,
787                     'ceph-osd',
788                     '--cluster',
789                     cluster_name,
790                     '--mkfs',
791                     '--mkkey',
792                     '-i', id_,
793                     '--monmap', monmap_path,
794                 ],
795             )
796
797     log.info('Reading keys from all nodes...')
798     keys_fp = StringIO()
799     keys = []
800     for remote, roles_for_host in ctx.cluster.remotes.iteritems():
801         for type_ in ['mgr',  'mds', 'osd']:
802             if type_ == 'mgr' and config.get('skip_mgr_daemons', False):
803                 continue
804             for role in teuthology.cluster_roles_of_type(roles_for_host, type_, cluster_name):
805                 _, _, id_ = teuthology.split_role(role)
806                 data = teuthology.get_file(
807                     remote=remote,
808                     path='/var/lib/ceph/{type}/{cluster}-{id}/keyring'.format(
809                         type=type_,
810                         id=id_,
811                         cluster=cluster_name,
812                     ),
813                     sudo=True,
814                 )
815                 keys.append((type_, id_, data))
816                 keys_fp.write(data)
817     for remote, roles_for_host in ctx.cluster.remotes.iteritems():
818         for role in teuthology.cluster_roles_of_type(roles_for_host, 'client', cluster_name):
819             _, _, id_ = teuthology.split_role(role)
820             data = teuthology.get_file(
821                 remote=remote,
822                 path='/etc/ceph/{cluster}.client.{id}.keyring'.format(id=id_, cluster=cluster_name)
823             )
824             keys.append(('client', id_, data))
825             keys_fp.write(data)
826
827     log.info('Adding keys to all mons...')
828     writes = mons.run(
829         args=[
830             'sudo', 'tee', '-a',
831             keyring_path,
832         ],
833         stdin=run.PIPE,
834         wait=False,
835         stdout=StringIO(),
836     )
837     keys_fp.seek(0)
838     teuthology.feed_many_stdins_and_close(keys_fp, writes)
839     run.wait(writes)
840     for type_, id_, data in keys:
841         run.wait(
842             mons.run(
843                 args=[
844                          'sudo',
845                          'adjust-ulimits',
846                          'ceph-coverage',
847                          coverage_dir,
848                          'ceph-authtool',
849                          keyring_path,
850                          '--name={type}.{id}'.format(
851                              type=type_,
852                              id=id_,
853                          ),
854                      ] + list(generate_caps(type_)),
855                 wait=False,
856             ),
857         )
858
859     log.info('Running mkfs on mon nodes...')
860     for remote, roles_for_host in mons.remotes.iteritems():
861         for role in teuthology.cluster_roles_of_type(roles_for_host, 'mon', cluster_name):
862             _, _, id_ = teuthology.split_role(role)
863             remote.run(
864                 args=[
865                     'sudo',
866                     'mkdir',
867                     '-p',
868                     '/var/lib/ceph/mon/{cluster}-{id}'.format(id=id_, cluster=cluster_name),
869                 ],
870             )
871             remote.run(
872                 args=[
873                     'sudo',
874                     'adjust-ulimits',
875                     'ceph-coverage',
876                     coverage_dir,
877                     'ceph-mon',
878                     '--cluster', cluster_name,
879                     '--mkfs',
880                     '-i', id_,
881                     '--monmap', monmap_path,
882                     '--keyring', keyring_path,
883                 ],
884             )
885
886     run.wait(
887         mons.run(
888             args=[
889                 'rm',
890                 '--',
891                 monmap_path,
892             ],
893             wait=False,
894         ),
895     )
896
897     try:
898         yield
899     except Exception:
900         # we need to know this below
901         ctx.summary['success'] = False
902         raise
903     finally:
904         (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
905
906         log.info('Checking cluster log for badness...')
907
908         def first_in_ceph_log(pattern, excludes):
909             """
910             Find the first occurrence of the specified pattern in the Ceph log.
911             Returns None if none found.
912
913             :param pattern: Pattern scanned for.
914             :param excludes: Patterns to ignore.
915             :return: First line of text (or None if not found)
916             """
917             args = [
918                 'sudo',
919                 'egrep', pattern,
920                 '/var/log/ceph/{cluster}.log'.format(cluster=cluster_name),
921             ]
922             for exclude in excludes:
923                 args.extend([run.Raw('|'), 'egrep', '-v', exclude])
924             args.extend([
925                 run.Raw('|'), 'head', '-n', '1',
926             ])
927             r = mon0_remote.run(
928                 stdout=StringIO(),
929                 args=args,
930             )
931             stdout = r.stdout.getvalue()
932             if stdout != '':
933                 return stdout
934             return None
935
936         if first_in_ceph_log('\[ERR\]|\[WRN\]|\[SEC\]',
937                              config['log_whitelist']) is not None:
938             log.warning('Found errors (ERR|WRN|SEC) in cluster log')
939             ctx.summary['success'] = False
940             # use the most severe problem as the failure reason
941             if 'failure_reason' not in ctx.summary:
942                 for pattern in ['\[SEC\]', '\[ERR\]', '\[WRN\]']:
943                     match = first_in_ceph_log(pattern, config['log_whitelist'])
944                     if match is not None:
945                         ctx.summary['failure_reason'] = \
946                             '"{match}" in cluster log'.format(
947                                 match=match.rstrip('\n'),
948                             )
949                         break
950
951         for remote, dirs in devs_to_clean.iteritems():
952             for dir_ in dirs:
953                 log.info('Unmounting %s on %s' % (dir_, remote))
954                 try:
955                     remote.run(
956                         args=[
957                             'sync',
958                             run.Raw('&&'),
959                             'sudo',
960                             'umount',
961                             '-f',
962                             dir_
963                         ]
964                     )
965                 except Exception as e:
966                     remote.run(args=[
967                         'sudo',
968                         run.Raw('PATH=/usr/sbin:$PATH'),
969                         'lsof',
970                         run.Raw(';'),
971                         'ps', 'auxf',
972                     ])
973                     raise e
974
975         if config.get('tmpfs_journal'):
976             log.info('tmpfs journal enabled - unmounting tmpfs at /mnt')
977             for remote, roles_for_host in osds.remotes.iteritems():
978                 remote.run(
979                     args=['sudo', 'umount', '-f', '/mnt'],
980                     check_status=False,
981                 )
982
983         if ctx.archive is not None and \
984                 not (ctx.config.get('archive-on-error') and ctx.summary['success']):
985
986             # archive mon data, too
987             log.info('Archiving mon data...')
988             path = os.path.join(ctx.archive, 'data')
989             try:
990                 os.makedirs(path)
991             except OSError as e:
992                 if e.errno == errno.EEXIST:
993                     pass
994                 else:
995                     raise
996             for remote, roles in mons.remotes.iteritems():
997                 for role in roles:
998                     is_mon = teuthology.is_type('mon', cluster_name)
999                     if is_mon(role):
1000                         _, _, id_ = teuthology.split_role(role)
1001                         mon_dir = '/var/lib/ceph/mon/' + \
1002                                   '{0}-{1}'.format(cluster_name, id_)
1003                         teuthology.pull_directory_tarball(
1004                             remote,
1005                             mon_dir,
1006                             path + '/' + role + '.tgz')
1007
1008         log.info('Cleaning ceph cluster...')
1009         run.wait(
1010             ctx.cluster.run(
1011                 args=[
1012                     'sudo',
1013                     'rm',
1014                     '-rf',
1015                     '--',
1016                     conf_path,
1017                     keyring_path,
1018                     data_dir,
1019                     monmap_path,
1020                     run.Raw('{tdir}/../*.pid'.format(tdir=testdir)),
1021                 ],
1022                 wait=False,
1023             ),
1024         )
1025
1026
1027 def osd_scrub_pgs(ctx, config):
1028     """
1029     Scrub pgs when we exit.
1030
1031     First make sure all pgs are active and clean.
1032     Next scrub all osds.
1033     Then periodically check until all pgs have scrub time stamps that
1034     indicate the last scrub completed.  Time out if no progress is made
1035     here after two minutes.
1036     """
1037     retries = 40
1038     delays = 20
1039     cluster_name = config['cluster']
1040     manager = ctx.managers[cluster_name]
1041     all_clean = False
1042     for _ in range(0, retries):
1043         stats = manager.get_pg_stats()
1044         bad = [stat['pgid'] for stat in stats if 'active+clean' not in stat['state']]
1045         if not bad:
1046             all_clean = True
1047             break
1048         log.info(
1049             "Waiting for all PGs to be active and clean, waiting on %s" % bad)
1050         time.sleep(delays)
1051     if not all_clean:
1052         raise RuntimeError("Scrubbing terminated -- not all pgs were active and clean.")
1053     check_time_now = time.localtime()
1054     time.sleep(1)
1055     all_roles = teuthology.all_roles(ctx.cluster)
1056     for role in teuthology.cluster_roles_of_type(all_roles, 'osd', cluster_name):
1057         log.info("Scrubbing {osd}".format(osd=role))
1058         _, _, id_ = teuthology.split_role(role)
1059         # allow this to fail; in certain cases the OSD might not be up
1060         # at this point.  we will catch all pgs below.
1061         try:
1062             manager.raw_cluster_cmd('osd', 'deep-scrub', id_)
1063         except run.CommandFailedError:
1064             pass
1065     prev_good = 0
1066     gap_cnt = 0
1067     loop = True
1068     while loop:
1069         stats = manager.get_pg_stats()
1070         timez = [(stat['pgid'],stat['last_scrub_stamp']) for stat in stats]
1071         loop = False
1072         thiscnt = 0
1073         for (pgid, tmval) in timez:
1074             pgtm = time.strptime(tmval[0:tmval.find('.')], '%Y-%m-%d %H:%M:%S')
1075             if pgtm > check_time_now:
1076                 thiscnt += 1
1077             else:
1078                 log.info('pgid %s last_scrub_stamp %s %s <= %s', pgid, tmval, pgtm, check_time_now)
1079                 loop = True
1080         if thiscnt > prev_good:
1081             prev_good = thiscnt
1082             gap_cnt = 0
1083         else:
1084             gap_cnt += 1
1085             if gap_cnt % 6 == 0:
1086                 for (pgid, tmval) in timez:
1087                     # re-request scrub every so often in case the earlier
1088                     # request was missed.  do not do it every time because
1089                     # the scrub may be in progress or not reported yet and
1090                     # we will starve progress.
1091                     manager.raw_cluster_cmd('pg', 'deep-scrub', pgid)
1092             if gap_cnt > retries:
1093                 raise RuntimeError('Exiting scrub checking -- not all pgs scrubbed.')
1094         if loop:
1095             log.info('Still waiting for all pgs to be scrubbed.')
1096             time.sleep(delays)
1097
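# Rough timing implied by the constants above: the active+clean wait allows
# up to retries * delays = 40 * 20s (about 13 minutes); the scrub-stamp loop
# re-issues 'pg deep-scrub' every 6 idle iterations (roughly 2 minutes) and
# gives up after 40 iterations without progress.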
1098
1099 @contextlib.contextmanager
1100 def run_daemon(ctx, config, type_):
1101     """
1102     Run daemons for a role type.  Handle the startup and termination of a daemon.
1103     On startup -- set coverages, cpu_profile, valgrind values for all remotes,
1104     and a max_mds value for one mds.
1105     On cleanup -- Stop all existing daemons of this type.
1106
1107     :param ctx: Context
1108     :param config: Configuration
1109     :param type_: Role type
1110     """
1111     cluster_name = config['cluster']
1112     log.info('Starting %s daemons in cluster %s...', type_, cluster_name)
1113     testdir = teuthology.get_testdir(ctx)
1114     daemons = ctx.cluster.only(teuthology.is_type(type_, cluster_name))
1115
1116     # check whether any daemons of this type are configured
1117     if daemons is None:
1118         return
1119     coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
1120
1121     daemon_signal = 'kill'
1122     if config.get('coverage') or config.get('valgrind') is not None:
1123         daemon_signal = 'term'
1124
1125     # create osds in order.  (this only matters for pre-luminous, which might
1126     # be hammer, which doesn't take an id_ argument to legacy 'osd create').
1127     osd_uuids  = {}
1128     for remote, roles_for_host in daemons.remotes.iteritems():
1129         is_type_ = teuthology.is_type(type_, cluster_name)
1130         for role in roles_for_host:
1131             if not is_type_(role):
1132                 continue
1133             _, _, id_ = teuthology.split_role(role)
1134
1135
1136             if type_ == 'osd':
1137                 datadir='/var/lib/ceph/osd/{cluster}-{id}'.format(
1138                     cluster=cluster_name, id=id_)
1139                 osd_uuid = teuthology.get_file(
1140                     remote=remote,
1141                     path=datadir + '/fsid',
1142                     sudo=True,
1143                 ).strip()
1144                 osd_uuids[id_] = osd_uuid
1145     for osd_id in range(len(osd_uuids)):
1146         id_ = str(osd_id)
1147         osd_uuid = osd_uuids.get(id_)
1148         try:
1149             remote.run(
1150                 args=[
1151                 'sudo', 'ceph', '--cluster', cluster_name,
1152                     'osd', 'new', osd_uuid, id_,
1153                 ]
1154             )
1155         except:
1156             # fallback to pre-luminous (hammer or jewel)
1157             remote.run(
1158                 args=[
1159                 'sudo', 'ceph', '--cluster', cluster_name,
1160                     'osd', 'create', osd_uuid,
1161                 ]
1162             )
1163             if config.get('add_osds_to_crush'):
1164                 remote.run(
1165                 args=[
1166                     'sudo', 'ceph', '--cluster', cluster_name,
1167                     'osd', 'crush', 'create-or-move', 'osd.' + id_,
1168                     '1.0', 'host=localhost', 'root=default',
1169                 ]
1170             )
1171
1172     for remote, roles_for_host in daemons.remotes.iteritems():
1173         is_type_ = teuthology.is_type(type_, cluster_name)
1174         for role in roles_for_host:
1175             if not is_type_(role):
1176                 continue
1177             _, _, id_ = teuthology.split_role(role)
1178
1179             run_cmd = [
1180                 'sudo',
1181                 'adjust-ulimits',
1182                 'ceph-coverage',
1183                 coverage_dir,
1184                 'daemon-helper',
1185                 daemon_signal,
1186             ]
1187             run_cmd_tail = [
1188                 'ceph-%s' % (type_),
1189                 '-f',
1190                 '--cluster', cluster_name,
1191                 '-i', id_]
1192
1193             if type_ in config.get('cpu_profile', []):
1194                 profile_path = '/var/log/ceph/profiling-logger/%s.prof' % (role)
1195                 run_cmd.extend(['env', 'CPUPROFILE=%s' % profile_path])
1196
1197             if config.get('valgrind') is not None:
1198                 valgrind_args = None
1199                 if type_ in config['valgrind']:
1200                     valgrind_args = config['valgrind'][type_]
1201                 if role in config['valgrind']:
1202                     valgrind_args = config['valgrind'][role]
1203                 run_cmd = teuthology.get_valgrind_args(testdir, role,
1204                                                        run_cmd,
1205                                                        valgrind_args)
1206
1207             run_cmd.extend(run_cmd_tail)
1208
1209             # always register mgr; don't necessarily start
1210             ctx.daemons.register_daemon(
1211                 remote, type_, id_,
1212                 cluster=cluster_name,
1213                 args=run_cmd,
1214                 logger=log.getChild(role),
1215                 stdin=run.PIPE,
1216                 wait=False
1217             )
1218             if type_ != 'mgr' or not config.get('skip_mgr_daemons', False):
1219                 role = cluster_name + '.' + type_
1220                 ctx.daemons.get_daemon(type_, id_, cluster_name).restart()
1221
1222     try:
1223         yield
1224     finally:
1225         teuthology.stop_daemons_of_type(ctx, type_, cluster_name)
1226
1227
1228 def healthy(ctx, config):
1229     """
1230     Wait for all OSDs to be up, and for ceph health to return HEALTH_OK.
1231
1232     :param ctx: Context
1233     :param config: Configuration
1234     """
1235     config = config if isinstance(config, dict) else dict()
1236     cluster_name = config.get('cluster', 'ceph')
1237     log.info('Waiting until %s daemons up and pgs clean...', cluster_name)
1238     manager = ctx.managers[cluster_name]
1239     try:
1240         manager.wait_for_mgr_available(timeout=30)
1241     except (run.CommandFailedError, AssertionError) as e:
1242         log.info('ignoring mgr wait error, probably testing upgrade: %s', e)
1243
1244     firstmon = teuthology.get_first_mon(ctx, config, cluster_name)
1245     (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
1246     teuthology.wait_until_osds_up(
1247         ctx,
1248         cluster=ctx.cluster,
1249         remote=mon0_remote,
1250         ceph_cluster=cluster_name,
1251     )
1252
1253     try:
1254         manager.flush_all_pg_stats()
1255     except (run.CommandFailedError, Exception) as e:
1256         log.info('ignoring flush pg stats error, probably testing upgrade: %s', e)
1257     manager.wait_for_clean()
1258
1259     log.info('Waiting until ceph cluster %s is healthy...', cluster_name)
1260     teuthology.wait_until_healthy(
1261         ctx,
1262         remote=mon0_remote,
1263         ceph_cluster=cluster_name,
1264     )
1265
1266     if ctx.cluster.only(teuthology.is_type('mds', cluster_name)).remotes:
1267         # Some MDSs exist, wait for them to be healthy
1268         ceph_fs = Filesystem(ctx) # TODO: make Filesystem cluster-aware
1269         ceph_fs.wait_for_daemons(timeout=300)
1270
1271
1272 def wait_for_osds_up(ctx, config):
1273     """
1274     Wait for all OSDs to come up.
1275
1276     :param ctx: Context
1277     :param config: Configuration
1278     """
1279     log.info('Waiting until ceph osds are all up...')
1280     cluster_name = config.get('cluster', 'ceph')
1281     firstmon = teuthology.get_first_mon(ctx, config, cluster_name)
1282     (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
1283     teuthology.wait_until_osds_up(
1284         ctx,
1285         cluster=ctx.cluster,
1286         remote=mon0_remote
1287     )
1288
1289
1290 def wait_for_mon_quorum(ctx, config):
1291     """
1292     Check remote ceph status until all monitors are up.
1293
1294     :param ctx: Context
1295     :param config: Configuration
1296     """
1297     if isinstance(config, dict):
1298         mons = config['daemons']
1299         cluster_name = config.get('cluster', 'ceph')
1300     else:
1301         assert isinstance(config, list)
1302         mons = config
1303         cluster_name = 'ceph'
1304     firstmon = teuthology.get_first_mon(ctx, config, cluster_name)
1305     (remote,) = ctx.cluster.only(firstmon).remotes.keys()
1306     with contextutil.safe_while(sleep=10, tries=60,
1307                                 action='wait for monitor quorum') as proceed:
1308         while proceed():
1309             r = remote.run(
1310                 args=[
1311                     'sudo',
1312                     'ceph',
1313                     'quorum_status',
1314                 ],
1315                 stdout=StringIO(),
1316                 logger=log.getChild('quorum_status'),
1317             )
1318             j = json.loads(r.stdout.getvalue())
1319             q = j.get('quorum_names', [])
1320             log.debug('Quorum: %s', q)
1321             if sorted(q) == sorted(mons):
1322                 break
1323
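# wait_for_mon_quorum accepts either a bare list of mon ids or a dict,
# mirroring the isinstance() handling above; a sketch of both forms:
#
#   tasks:
#   - ceph.wait_for_mon_quorum: [a, b, c]
#
# or:
#
#   tasks:
#   - ceph.wait_for_mon_quorum:
#       daemons: [a, b, c]
#       cluster: ceph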
1324
1325 def created_pool(ctx, config):
1326     """
1327     Add new pools to the dictionary of pools that the ceph-manager
1328     knows about.
1329     """
1330     for new_pool in config:
1331         if new_pool not in ctx.managers['ceph'].pools:
1332             ctx.managers['ceph'].pools[new_pool] = ctx.managers['ceph'].get_pool_property(
1333                 new_pool, 'pg_num')
1334
1335
1336 @contextlib.contextmanager
1337 def restart(ctx, config):
1338     """
1339    restart ceph daemons
1340
1341    For example::
1342       tasks:
1343       - ceph.restart: [all]
1344
1345    For example::
1346       tasks:
1347       - ceph.restart: [osd.0, mon.1, mds.*]
1348
1349    or::
1350
1351       tasks:
1352       - ceph.restart:
1353           daemons: [osd.0, mon.1]
1354           wait-for-healthy: false
1355           wait-for-osds-up: true
1356
1357     :param ctx: Context
1358     :param config: Configuration
1359     """
1360     if config is None:
1361         config = {}
1362     elif isinstance(config, list):
1363         config = {'daemons': config}
1364
1365     daemons = ctx.daemons.resolve_role_list(config.get('daemons', None), CEPH_ROLE_TYPES, True)
1366     clusters = set()
1367     for role in daemons:
1368         cluster, type_, id_ = teuthology.split_role(role)
1369         ctx.daemons.get_daemon(type_, id_, cluster).restart()
1370         clusters.add(cluster)
1371
1372     manager = ctx.managers['ceph']
1373     for dmon in daemons:
1374         if '.' in dmon:
1375             dm_parts = dmon.split('.')
1376             if dm_parts[1].isdigit():
1377                 if dm_parts[0] == 'osd':
1378                     manager.mark_down_osd(int(dm_parts[1]))
1379
1380     if config.get('wait-for-healthy', True):
1381         for cluster in clusters:
1382             healthy(ctx=ctx, config=dict(cluster=cluster))
1383     if config.get('wait-for-osds-up', False):
1384         for cluster in clusters:
1385             wait_for_osds_up(ctx=ctx, config=dict(cluster=cluster))
1386     yield
1387
1388
1389 @contextlib.contextmanager
1390 def stop(ctx, config):
1391     """
1392     Stop ceph daemons
1393
1394     For example::
1395       tasks:
1396       - ceph.stop: [mds.*]
1397
1398       tasks:
1399       - ceph.stop: [osd.0, osd.2]
1400
1401       tasks:
1402       - ceph.stop:
1403           daemons: [osd.0, osd.2]
1404
1405     """
1406     if config is None:
1407         config = {}
1408     elif isinstance(config, list):
1409         config = {'daemons': config}
1410
1411     daemons = ctx.daemons.resolve_role_list(config.get('daemons', None), CEPH_ROLE_TYPES, True)
1412     for role in daemons:
1413         cluster, type_, id_ = teuthology.split_role(role)
1414         ctx.daemons.get_daemon(type_, id_, cluster).stop()
1415
1416     yield
1417
1418
1419 @contextlib.contextmanager
1420 def wait_for_failure(ctx, config):
1421     """
1422     Wait for a failure of a ceph daemon
1423
1424     For example::
1425       tasks:
1426       - ceph.wait_for_failure: [mds.*]
1427
1428       tasks:
1429       - ceph.wait_for_failure: [osd.0, osd.2]
1430
1431       tasks:
1432       - ceph.wait_for_failure:
1433           daemons: [osd.0, osd.2]
1434
1435     """
1436     if config is None:
1437         config = {}
1438     elif isinstance(config, list):
1439         config = {'daemons': config}
1440
1441     daemons = ctx.daemons.resolve_role_list(config.get('daemons', None), CEPH_ROLE_TYPES, True)
1442     for role in daemons:
1443         cluster, type_, id_ = teuthology.split_role(role)
1444         try:
1445             ctx.daemons.get_daemon(type_, id_, cluster).wait()
1446         except:
1447             log.info('Saw expected daemon failure.  Continuing.')
1448             pass
1449         else:
1450             raise RuntimeError('daemon %s did not fail' % role)
1451
1452     yield
1453
1454
1455 def validate_config(ctx, config):
1456     """
1457     Perform some simple validation on task configuration.
1458     Raises exceptions.ConfigError if an error is found.
1459     """
1460     # check for osds from multiple clusters on the same host
1461     for remote, roles_for_host in ctx.cluster.remotes.items():
1462         last_cluster = None
1463         last_role = None
1464         for role in roles_for_host:
1465             role_cluster, role_type, _ = teuthology.split_role(role)
1466             if role_type != 'osd':
1467                 continue
1468             if last_cluster and last_cluster != role_cluster:
1469                 msg = "Host should not have osds (%s and %s) from multiple clusters" % (
1470                     last_role, role)
1471                 raise exceptions.ConfigError(msg)
1472             last_cluster = role_cluster
1473             last_role = role
1474
1475
1476 @contextlib.contextmanager
1477 def task(ctx, config):
1478     """
1479     Set up and tear down a Ceph cluster.
1480
1481     For example::
1482
1483         tasks:
1484         - ceph:
1485         - interactive:
1486
1487     You can also specify what branch to run::
1488
1489         tasks:
1490         - ceph:
1491             branch: foo
1492
1493     Or a tag::
1494
1495         tasks:
1496         - ceph:
1497             tag: v0.42.13
1498
1499     Or a sha1::
1500
1501         tasks:
1502         - ceph:
1503             sha1: 1376a5ab0c89780eab39ffbbe436f6a6092314ed
1504
1505     Or a local source dir::
1506
1507         tasks:
1508         - ceph:
1509             path: /home/sage/ceph
1510
1511     To capture code coverage data, use::
1512
1513         tasks:
1514         - ceph:
1515             coverage: true
1516
1517     To use btrfs, ext4, or xfs on the target's scratch disks, use::
1518
1519         tasks:
1520         - ceph:
1521             fs: xfs
1522             mkfs_options: [-b,size=65536,-l,logdev=/dev/sdc1]
1523             mount_options: [nobarrier, inode64]
1524
1525     Note that this will cause the task to check the /scratch_devs file on each
1526     node for available devices.  If no such file is found, /dev/sdb will be used.
1527
1528     To run some daemons under valgrind, include their names
1529     and the tool/args to use in a valgrind section::
1530
1531         tasks:
1532         - ceph:
1533             valgrind:
1534               mds.1: --tool=memcheck
1535               osd.1: [--tool=memcheck, --leak-check=no]
1536
1537     Nodes running daemons under valgrind (e.g. memcheck) will have
1538     their valgrind output checked for bad results.
1539
1540     To adjust or modify config options, use::
1541
1542         tasks:
1543         - ceph:
1544             conf:
1545               section:
1546                 key: value
1547
1548     For example::
1549
1550         tasks:
1551         - ceph:
1552             conf:
1553               mds.0:
1554                 some option: value
1555                 other key: other value
1556               client.0:
1557                 debug client: 10
1558                 debug ms: 1
1559
1560     By default, the cluster log is checked for errors and warnings,
1561     and the run is marked failed if any appear. You can ignore log
1562     entries by giving a list of egrep-compatible regexes, e.g.::
1563
1564         tasks:
1565         - ceph:
1566             log-whitelist: ['foo.*bar', 'bad message']
1567
1568     To run multiple ceph clusters, use multiple ceph tasks, and roles
1569     with a cluster name prefix, e.g. cluster1.client.0. Roles with no
1570     cluster use the default cluster name, 'ceph'. OSDs from separate
1571     clusters must be on separate hosts. Clients and non-osd daemons
1572     from multiple clusters may be colocated. For each cluster, add an
1573     instance of the ceph task with the cluster name specified, e.g.::
1574
1575         roles:
1576         - [mon.a, osd.0, osd.1]
1577         - [backup.mon.a, backup.osd.0, backup.osd.1]
1578         - [client.0, backup.client.0]
1579         tasks:
1580         - ceph:
1581             cluster: ceph
1582         - ceph:
1583             cluster: backup
1584
1585     :param ctx: Context
1586     :param config: Configuration
1587
1588     """
1589     if config is None:
1590         config = {}
1591     assert isinstance(config, dict), \
1592         "task ceph only supports a dictionary for configuration"
1593
1594     overrides = ctx.config.get('overrides', {})
1595     teuthology.deep_merge(config, overrides.get('ceph', {}))
1596
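    # only the first ceph task in a job creates the DaemonGroup; additional
    # ceph tasks (extra clusters) reuse it and skip the once-per-run setup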
1597     first_ceph_cluster = False
1598     if not hasattr(ctx, 'daemons'):
1599         first_ceph_cluster = True
1600         ctx.daemons = DaemonGroup()
1601
1602     testdir = teuthology.get_testdir(ctx)
1603     if config.get('coverage'):
1604         coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
1605         log.info('Creating coverage directory...')
1606         run.wait(
1607             ctx.cluster.run(
1608                 args=[
1609                     'install', '-d', '-m0755', '--',
1610                     coverage_dir,
1611                 ],
1612                 wait=False,
1613             )
1614         )
1615
1616     if 'cluster' not in config:
1617         config['cluster'] = 'ceph'
1618
1619     validate_config(ctx, config)
1620
1621     subtasks = []
1622     if first_ceph_cluster:
1623         # these tasks handle general log setup and parsing on all hosts,
1624         # so they should only be run once
1625         subtasks = [
1626             lambda: ceph_log(ctx=ctx, config=None),
1627             lambda: valgrind_post(ctx=ctx, config=config),
1628         ]
1629
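    # bring the cluster up in dependency order: mons, mgrs, crush map setup,
    # osds, the default rbd pool, cephfs setup, and finally mds daemons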
1630     subtasks += [
1631         lambda: cluster(ctx=ctx, config=dict(
1632             conf=config.get('conf', {}),
1633             fs=config.get('fs', 'xfs'),
1634             mkfs_options=config.get('mkfs_options', None),
1635             mount_options=config.get('mount_options', None),
1636             block_journal=config.get('block_journal', None),
1637             tmpfs_journal=config.get('tmpfs_journal', None),
1638             skip_mgr_daemons=config.get('skip_mgr_daemons', False),
1639             log_whitelist=config.get('log-whitelist', []),
1640             cpu_profile=set(config.get('cpu_profile', [])),
1641             cluster=config['cluster'],
1642         )),
1643         lambda: run_daemon(ctx=ctx, config=config, type_='mon'),
1644         lambda: run_daemon(ctx=ctx, config=config, type_='mgr'),
1645         lambda: crush_setup(ctx=ctx, config=config),
1646         lambda: run_daemon(ctx=ctx, config=config, type_='osd'),
1647         lambda: create_rbd_pool(ctx=ctx, config=config),
1648         lambda: cephfs_setup(ctx=ctx, config=config),
1649         lambda: run_daemon(ctx=ctx, config=config, type_='mds'),
1650     ]
1651
1652     with contextutil.nested(*subtasks):
1653         first_mon = teuthology.get_first_mon(ctx, config, config['cluster'])
1654         (mon,) = ctx.cluster.only(first_mon).remotes.keys()
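        # keep one CephManager per cluster, keyed by cluster name, so later
        # tasks (including ceph.restart) can run commands against this cluster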
1655         if not hasattr(ctx, 'managers'):
1656             ctx.managers = {}
1657         ctx.managers[config['cluster']] = CephManager(
1658             mon,
1659             ctx=ctx,
1660             logger=log.getChild('ceph_manager.' + config['cluster']),
1661             cluster=config['cluster'],
1662         )
1663
1664         try:
1665             if config.get('wait-for-healthy', True):
1666                 healthy(ctx=ctx, config=dict(cluster=config['cluster']))
1667
1668             yield
1669         finally:
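            # optionally scrub every pg before tearing the cluster down
            # (enabled by default via 'wait-for-scrub')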
1670             if config.get('wait-for-scrub', True):
1671                 osd_scrub_pgs(ctx, config)
1672
1673             # stop logging health to clog during shutdown, or else we generate
1674             # a bunch of scary messages unrelated to our actual run.
1675             firstmon = teuthology.get_first_mon(ctx, config, config['cluster'])
1676             (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
1677             mon0_remote.run(
1678                 args=[
1679                     'sudo',
1680                     'ceph',
1681                     '--cluster', config['cluster'],
1682                     'tell',
1683                     'mon.*',
1684                     'injectargs',
1685                     '--',
1686                     '--no-mon-health-to-clog',
1687                 ]
1688             )