2 Recovery system benchmarking
4 from cStringIO import StringIO
14 from teuthology import misc as teuthology
16 log = logging.getLogger(__name__)
18 @contextlib.contextmanager
19 def task(ctx, config):
21 Benchmark the recovery system.
23 Generates objects with smalliobench, runs it normally to get a
24 baseline performance measurement, then marks an OSD out and reruns
25 to measure performance during recovery.
27 The config should be as follows:
30 duration: <seconds for each measurement run>
31 num_objects: <number of objects>
32 io_size: <io size in bytes>
45 assert isinstance(config, dict), \
46 'recovery_bench task only accepts a dict for configuration'
48 log.info('Beginning recovery bench...')
50 first_mon = teuthology.get_first_mon(ctx, config)
51 (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
53 manager = ceph_manager.CephManager(
56 logger=log.getChild('ceph_manager'),
59 num_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd')
60 while len(manager.get_osd_status()['up']) < num_osds:
63 bench_proc = RecoveryBencher(
70 log.info('joining recovery bencher')
73 class RecoveryBencher:
77 def __init__(self, manager, config):
78 self.ceph_manager = manager
79 self.ceph_manager.wait_for_clean()
81 osd_status = self.ceph_manager.get_osd_status()
82 self.osds = osd_status['up']
85 if self.config is None:
91 Local wrapper to print value.
96 log.info("spawning thread")
98 self.thread = gevent.spawn(self.do_bench)
102 Join the recovery bencher. This is called after the main
111 duration = self.config.get("duration", 60)
112 num_objects = self.config.get("num_objects", 500)
113 io_size = self.config.get("io_size", 4096)
115 osd = str(random.choice(self.osds))
116 (osd_remote,) = self.ceph_manager.ctx.cluster.only('osd.%s' % osd).remotes.iterkeys()
118 testdir = teuthology.get_testdir(self.ceph_manager.ctx)
125 '{tdir}/archive/coverage'.format(tdir=testdir),
126 'smalliobench'.format(tdir=testdir),
127 '--use-prefix', 'recovery_bench',
129 '--num-objects', str(num_objects),
130 '--io-size', str(io_size),
136 log.info('non-recovery (baseline)')
141 '{tdir}/archive/coverage'.format(tdir=testdir),
143 '--use-prefix', 'recovery_bench',
144 '--do-not-init', '1',
145 '--duration', str(duration),
146 '--io-size', str(io_size),
152 self.process_samples(p.stderr.getvalue())
154 self.ceph_manager.raw_cluster_cmd('osd', 'out', osd)
158 log.info('recovery active')
163 '{tdir}/archive/coverage'.format(tdir=testdir),
165 '--use-prefix', 'recovery_bench',
166 '--do-not-init', '1',
167 '--duration', str(duration),
168 '--io-size', str(io_size),
174 self.process_samples(p.stderr.getvalue())
176 self.ceph_manager.raw_cluster_cmd('osd', 'in', osd)
178 def process_samples(self, input):
180 Extract samples from the input and process the results
182 :param input: input lines in JSON format
185 for line in input.split('\n'):
187 sample = json.loads(line)
188 samples = lat.setdefault(sample['type'], [])
189 samples.append(float(sample['latency']))
200 if num & 1 == 1: # odd number of samples
201 median = samples[num / 2]
203 median = (samples[num / 2] + samples[num / 2 - 1]) / 2
206 ninety_nine = samples[int(num * 0.99)]
208 log.info("%s: median %f, 99%% %f" % (type, median, ninety_nine))