X-Git-Url: https://gerrit.opnfv.org/gerrit/gitweb?a=blobdiff_plain;f=src%2Fceph%2Fqa%2Ftasks%2Fthrashosds.py;fp=src%2Fceph%2Fqa%2Ftasks%2Fthrashosds.py;h=0000000000000000000000000000000000000000;hb=7da45d65be36d36b880cc55c5036e96c24b53f00;hp=420b7355908d52832d46330637a00d603a8a4780;hpb=691462d09d0987b47e112d6ee8740375df3c51b2;p=stor4nfv.git

diff --git a/src/ceph/qa/tasks/thrashosds.py b/src/ceph/qa/tasks/thrashosds.py
deleted file mode 100644
index 420b735..0000000
--- a/src/ceph/qa/tasks/thrashosds.py
+++ /dev/null
@@ -1,204 +0,0 @@
-"""
-Thrash -- Simulate random osd failures.
-"""
-import contextlib
-import logging
-import ceph_manager
-from teuthology import misc as teuthology
-
-
-log = logging.getLogger(__name__)
-
-@contextlib.contextmanager
-def task(ctx, config):
-    """
-    "Thrash" the OSDs by randomly marking them out/down (and then back
-    in) until the task is ended. This loops, and every op_delay
-    seconds it randomly chooses to add or remove an OSD (even odds)
-    unless there are fewer than min_out OSDs out of the cluster, or
-    more than min_in OSDs in the cluster.
-
-    All commands are run on mon0 and it stops when __exit__ is called.
-
-    The config is optional, and is a dict containing some or all of:
-
-    cluster: (default 'ceph') the name of the cluster to thrash
-
-    min_in: (default 4) the minimum number of OSDs to keep in the
-       cluster
-
-    min_out: (default 0) the minimum number of OSDs to keep out of the
-       cluster
-
-    op_delay: (5) the length of time to sleep between changing an
-       OSD's status
-
-    min_dead: (0) minimum number of osds to leave down/dead.
-
-    max_dead: (0) maximum number of osds to leave down/dead before waiting
-       for clean. This should probably be num_replicas - 1.
-
-    clean_interval: (60) the approximate length of time to loop before
-       waiting until the cluster goes clean. (In reality this is used
-       to probabilistically choose when to wait, and the method used
-       makes it closer to -- but not identical to -- the half-life.)
-
-    scrub_interval: (-1) the approximate length of time to loop before
-       waiting until a scrub is performed while cleaning. (In reality
-       this is used to probabilistically choose when to wait, and it
-       only applies to the cases where cleaning is being performed).
-       -1 is used to indicate that no scrubbing will be done.
-
-    chance_down: (0.4) the probability that the thrasher will mark an
-       OSD down rather than marking it out. (The thrasher will not
-       consider that OSD out of the cluster, since presently an OSD
-       wrongly marked down will mark itself back up again.) This value
-       can be either an integer (eg, 75) or a float probability (eg
-       0.75).
-
-    chance_test_min_size: (0) chance to run test_pool_min_size,
-       which:
-       - kills all but one osd
-       - waits
-       - kills that osd
-       - revives all other osds
-       - verifies that the osds fully recover
-
-    timeout: (360) the number of seconds to wait for the cluster
-       to become clean after each cluster change. If this doesn't
-       happen within the timeout, an exception will be raised.
-
-    revive_timeout: (150) number of seconds to wait for an osd asok to
-       appear after attempting to revive the osd
-
-    thrash_primary_affinity: (true) randomly adjust primary-affinity
-
-    chance_pgnum_grow: (0) chance to increase a pool's size
-    chance_pgpnum_fix: (0) chance to adjust pgpnum to pg for a pool
-    pool_grow_by: (10) amount to increase pgnum by
-    max_pgs_per_pool_osd: (1200) don't expand pools past this size per osd
-
-    pause_short: (3) duration of short pause
-    pause_long: (80) duration of long pause
-    pause_check_after: (50) assert osd down after this long
-    chance_inject_pause_short: (1) chance of injecting short stall
-    chance_inject_pause_long: (0) chance of injecting long stall
-
-    clean_wait: (0) duration to wait before resuming thrashing once clean
-
-    sighup_delay: (0.1) duration to delay between sending signal.SIGHUP to a
-       random live osd
-
-    powercycle: (false) whether to power cycle the node instead
-       of just the osd process. Note that this assumes that a single
-       osd is the only important process on the node.
-
-    bdev_inject_crash: (0) seconds to delay while inducing a synthetic crash.
-       the delay lets the BlockDevice "accept" more aio operations but blocks
-       any flush, and then eventually crashes (losing some or all ios). If 0,
-       no bdev failure injection is enabled.
-
-    bdev_inject_crash_probability: (.5) probability of doing a bdev failure
-       injection crash vs a normal OSD kill.
-
-    chance_test_backfill_full: (0) chance to simulate full disks stopping
-       backfill
-
-    chance_test_map_discontinuity: (0) chance to test map discontinuity
-    map_discontinuity_sleep_time: (40) time to wait for map trims
-
-    ceph_objectstore_tool: (true) whether to export/import a pg while an osd is down
-    chance_move_pg: (1.0) chance of moving a pg if more than 1 osd is down (default 100%)
-
-    optrack_toggle_delay: (2.0) duration to delay between toggling op tracker
-       enablement to all osds
-
-    dump_ops_enable: (true) continuously dump ops on all live osds
-
-    noscrub_toggle_delay: (2.0) duration to delay between toggling noscrub
-
-    disable_objectstore_tool_tests: (false) disable ceph_objectstore_tool based
-       tests
-
-    chance_thrash_cluster_full: .05
-
-    chance_thrash_pg_upmap: 1.0
-    chance_thrash_pg_upmap_items: 1.0
-
-    example:
-
-    tasks:
-    - ceph:
-    - thrashosds:
-        cluster: ceph
-        chance_down: 10
-        op_delay: 3
-        min_in: 1
-        timeout: 600
-    - interactive:
-    """
-    if config is None:
-        config = {}
-    assert isinstance(config, dict), \
-        'thrashosds task only accepts a dict for configuration'
-    # add default value for sighup_delay
-    config['sighup_delay'] = config.get('sighup_delay', 0.1)
-    # add default value for optrack_toggle_delay
-    config['optrack_toggle_delay'] = config.get('optrack_toggle_delay', 2.0)
-    # add default value for dump_ops_enable
-    config['dump_ops_enable'] = config.get('dump_ops_enable', "true")
-    # add default value for noscrub_toggle_delay
-    config['noscrub_toggle_delay'] = config.get('noscrub_toggle_delay', 2.0)
-    # add default value for random_eio
-    config['random_eio'] = config.get('random_eio', 0.0)
-
-    log.info("config is {config}".format(config=str(config)))
-
-    overrides = ctx.config.get('overrides', {})
-    log.info("overrides is {overrides}".format(overrides=str(overrides)))
-    teuthology.deep_merge(config, overrides.get('thrashosds', {}))
-    cluster = config.get('cluster', 'ceph')
-
-    log.info("config is {config}".format(config=str(config)))
-
-    if 'powercycle' in config:
-
-        # sync everyone first to avoid collateral damage to / etc.
-        log.info('Doing preliminary sync to avoid collateral damage...')
-        ctx.cluster.run(args=['sync'])
-
-        if 'ipmi_user' in ctx.teuthology_config:
-            for remote in ctx.cluster.remotes.keys():
-                log.debug('checking console status of %s' % remote.shortname)
-                if not remote.console.check_status():
-                    log.warn('Failed to get console status for %s',
-                             remote.shortname)
-
-            # check that all osd remotes have a valid console
-            osds = ctx.cluster.only(teuthology.is_type('osd', cluster))
-            for remote in osds.remotes.keys():
-                if not remote.console.has_ipmi_credentials:
-                    raise Exception(
-                        'IPMI console required for powercycling, '
-                        'but not available on osd role: {r}'.format(
-                            r=remote.name))
-
-    cluster_manager = ctx.managers[cluster]
-    for f in ['powercycle', 'bdev_inject_crash']:
-        if config.get(f):
-            cluster_manager.config[f] = config.get(f)
-
-    log.info('Beginning thrashosds...')
-    thrash_proc = ceph_manager.Thrasher(
-        cluster_manager,
-        config,
-        logger=log.getChild('thrasher')
-        )
-    try:
-        yield
-    finally:
-        log.info('joining thrashosds')
-        thrash_proc.do_join()
-        cluster_manager.wait_for_all_osds_up()
-        cluster_manager.flush_all_pg_stats()
-        cluster_manager.wait_for_recovery(config.get('timeout', 360))
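
The deleted task assembles its effective configuration in two steps: explicit defaults are filled in with config.get(...), and then any "overrides: thrashosds:" section of the job yaml is merged on top via teuthology.deep_merge. The sketch below mirrors that flow; _deep_merge and build_thrash_config are simplified stand-ins written only for illustration, and the assumption that override values win over task-level values reflects how teuthology overrides are normally used, not anything shown in this diff.

# Standalone sketch of the configuration assembly done by task() above.
# _deep_merge is a simplified stand-in for teuthology.misc.deep_merge
# (assumption: values from the overrides dict take precedence).
def _deep_merge(base, overrides):
    for key, value in overrides.items():
        if isinstance(value, dict) and isinstance(base.get(key), dict):
            _deep_merge(base[key], value)
        else:
            base[key] = value
    return base

def build_thrash_config(task_config, ctx_config):
    config = dict(task_config or {})
    # defaults filled in the same way the task does
    config['sighup_delay'] = config.get('sighup_delay', 0.1)
    config['optrack_toggle_delay'] = config.get('optrack_toggle_delay', 2.0)
    config['dump_ops_enable'] = config.get('dump_ops_enable', "true")
    config['noscrub_toggle_delay'] = config.get('noscrub_toggle_delay', 2.0)
    config['random_eio'] = config.get('random_eio', 0.0)
    # overrides: thrashosds: {...} from the job yaml is merged on top
    overrides = ctx_config.get('overrides', {})
    _deep_merge(config, overrides.get('thrashosds', {}))
    return config

if __name__ == '__main__':
    task_cfg = {'chance_down': 10, 'op_delay': 3}
    ctx_cfg = {'overrides': {'thrashosds': {'chance_down': 0.25}}}
    print(build_thrash_config(task_cfg, ctx_cfg))
    # chance_down comes out as 0.25, op_delay stays 3, and the
    # sighup/optrack/noscrub/random_eio defaults are all present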
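
The chance_down documentation allows either an integer percentage (e.g. 75) or a float probability (e.g. 0.75). The conversion itself happens inside ceph_manager.Thrasher rather than in this file; normalize_chance below is a hypothetical helper that only illustrates the documented convention.

# Hypothetical helper, not part of the deleted file.
def normalize_chance(value):
    """Return a probability in [0.0, 1.0] from either an integer
    percentage (e.g. 75) or a float probability (e.g. 0.75)."""
    value = float(value)
    return value / 100.0 if value > 1.0 else value

assert normalize_chance(75) == 0.75
assert normalize_chance(0.4) == 0.4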
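
Because task() is a contextlib.contextmanager, thrashing starts before the nested tasks run and is torn down in the finally block: the thrasher is joined, the task waits for all OSDs to come back up, flushes pg stats, and waits for recovery within the configured timeout. The toy below reproduces only that lifecycle shape; ToyThrasher and toy_thrashosds are invented for this sketch and do not talk to a real cluster.

# Minimal, self-contained sketch of the start / yield / teardown pattern.
import contextlib
import threading
import time

class ToyThrasher(threading.Thread):
    def __init__(self, op_delay=5):
        super().__init__()
        self.op_delay = op_delay
        self._stop = threading.Event()

    def run(self):
        # stand-in for randomly marking OSDs down/out every op_delay seconds
        while not self._stop.is_set():
            print('thrash: marking a random osd down/out')
            self._stop.wait(self.op_delay)

    def do_join(self):
        self._stop.set()
        self.join()

@contextlib.contextmanager
def toy_thrashosds(config=None):
    config = config or {}
    thrasher = ToyThrasher(op_delay=config.get('op_delay', 5))
    thrasher.start()
    try:
        yield                      # nested tasks (the workload) run here
    finally:
        thrasher.do_join()         # stop thrashing before cleanup
        print('waiting for recovery (timeout=%s)' % config.get('timeout', 360))

if __name__ == '__main__':
    with toy_thrashosds({'op_delay': 1, 'timeout': 60}):
        time.sleep(3)              # stands in for the workload tasks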