src/ceph/qa/tasks/osd_failsafe_enospc.py

   1 """
   2 Handle osdfailsafe configuration settings (nearfull ratio and full ratio)
   3 """
   4 from cStringIO import StringIO
   5 import logging
   6 import time
   7
   8 from teuthology.orchestra import run
   9 from util.rados import rados
  10 from teuthology import misc as teuthology
  11
  12 log = logging.getLogger(__name__)
  13
  14 def task(ctx, config):
  15     """
  16     Test handling of osd_failsafe_nearfull_ratio and osd_failsafe_full_ratio
  17     configuration settings
  18
  19     In order for test to pass must use log-whitelist as follows
  20
  21         tasks:
  22             - chef:
  23             - install:
  24             - ceph:
  25                 log-whitelist: ['OSD near full', 'OSD full dropping all updates']
  26             - osd_failsafe_enospc:
  27
  28     """
  29     if config is None:
  30         config = {}
  31     assert isinstance(config, dict), \
  32         'osd_failsafe_enospc task only accepts a dict for configuration'
  33
  34     # Give 2 seconds for injectargs + osd_op_complaint_time (30) + 2 * osd_heartbeat_interval (6) + 6 padding
  35     sleep_time = 50
  36
  37     # something that is always there
  38     dummyfile = '/etc/fstab'
  39     dummyfile2 = '/etc/resolv.conf'
  40
  41     manager = ctx.managers['ceph']
  42
  43     # create 1 pg pool with 1 rep which can only be on osd.0
  44     osds = manager.get_osd_dump()
  45     for osd in osds:
  46         if osd['osd'] != 0:
  47             manager.mark_out_osd(osd['osd'])
  48
  49     log.info('creating pool foo')
  50     manager.create_pool("foo")
  51     manager.raw_cluster_cmd('osd', 'pool', 'set', 'foo', 'size', '1')
  52
  53     # State NONE -> NEAR
  54     log.info('1. Verify warning messages when exceeding nearfull_ratio')
  55
  56     first_mon = teuthology.get_first_mon(ctx, config)
  57     (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
  58
  59     proc = mon.run(
  60              args=[
  61                  'sudo',
  62                  'daemon-helper',
  63                  'kill',
  64                  'ceph', '-w'
  65              ],
  66              stdin=run.PIPE,
  67              stdout=StringIO(),
  68              wait=False,
  69         )
  70
  71     manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs', '--osd_failsafe_nearfull_ratio .00001')
  72
  73     time.sleep(sleep_time)
  74     proc.stdin.close() # causes daemon-helper send SIGKILL to ceph -w
  75     proc.wait()
  76
  77     lines = proc.stdout.getvalue().split('\n')
  78
  79     count = len(filter(lambda line: '[WRN] OSD near full' in line, lines))
  80     assert count == 2, 'Incorrect number of warning messages expected 2 got %d' % count
  81     count = len(filter(lambda line: '[ERR] OSD full dropping all updates' in line, lines))
  82     assert count == 0, 'Incorrect number of error messages expected 0 got %d' % count
  83
  84     # State NEAR -> FULL
  85     log.info('2. Verify error messages when exceeding full_ratio')
  86
  87     proc = mon.run(
  88              args=[
  89                  'sudo',
  90                  'daemon-helper',
  91                  'kill',
  92                  'ceph', '-w'
  93              ],
  94              stdin=run.PIPE,
  95              stdout=StringIO(),
  96              wait=False,
  97         )
  98
  99     manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs', '--osd_failsafe_full_ratio .00001')
 100
 101     time.sleep(sleep_time)
 102     proc.stdin.close() # causes daemon-helper send SIGKILL to ceph -w
 103     proc.wait()
 104
 105     lines = proc.stdout.getvalue().split('\n')
 106
 107     count = len(filter(lambda line: '[ERR] OSD full dropping all updates' in line, lines))
 108     assert count == 2, 'Incorrect number of error messages expected 2 got %d' % count
 109
 110     log.info('3. Verify write failure when exceeding full_ratio')
 111
 112     # Write data should fail
 113     ret = rados(ctx, mon, ['-p', 'foo', 'put', 'newfile1', dummyfile])
 114     assert ret != 0, 'Expected write failure but it succeeded with exit status 0'
 115
 116     # Put back default
 117     manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs', '--osd_failsafe_full_ratio .97')
 118     time.sleep(10)
 119
 120     # State FULL -> NEAR
 121     log.info('4. Verify write success when NOT exceeding full_ratio')
 122
 123     # Write should succeed
 124     ret = rados(ctx, mon, ['-p', 'foo', 'put', 'newfile2', dummyfile2])
 125     assert ret == 0, 'Expected write to succeed, but got exit status %d' % ret
 126
 127     log.info('5. Verify warning messages again when exceeding nearfull_ratio')
 128
 129     proc = mon.run(
 130              args=[
 131                  'sudo',
 132                  'daemon-helper',
 133                  'kill',
 134                  'ceph', '-w'
 135              ],
 136              stdin=run.PIPE,
 137              stdout=StringIO(),
 138              wait=False,
 139         )
 140
 141     time.sleep(sleep_time)
 142     proc.stdin.close() # causes daemon-helper send SIGKILL to ceph -w
 143     proc.wait()
 144
 145     lines = proc.stdout.getvalue().split('\n')
 146
 147     count = len(filter(lambda line: '[WRN] OSD near full' in line, lines))
 148     assert count == 1 or count == 2, 'Incorrect number of warning messages expected 1 or 2 got %d' % count
 149     count = len(filter(lambda line: '[ERR] OSD full dropping all updates' in line, lines))
 150     assert count == 0, 'Incorrect number of error messages expected 0 got %d' % count
 151
 152     manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs', '--osd_failsafe_nearfull_ratio .90')
 153     time.sleep(10)
 154
 155     # State NONE -> FULL
 156     log.info('6. Verify error messages again when exceeding full_ratio')
 157
 158     proc = mon.run(
 159              args=[
 160                  'sudo',
 161                  'daemon-helper',
 162                  'kill',
 163                  'ceph', '-w'
 164              ],
 165              stdin=run.PIPE,
 166              stdout=StringIO(),
 167              wait=False,
 168         )
 169
 170     manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs', '--osd_failsafe_full_ratio .00001')
 171
 172     time.sleep(sleep_time)
 173     proc.stdin.close() # causes daemon-helper send SIGKILL to ceph -w
 174     proc.wait()
 175
 176     lines = proc.stdout.getvalue().split('\n')
 177
 178     count = len(filter(lambda line: '[WRN] OSD near full' in line, lines))
 179     assert count == 0, 'Incorrect number of warning messages expected 0 got %d' % count
 180     count = len(filter(lambda line: '[ERR] OSD full dropping all updates' in line, lines))
 181     assert count == 2, 'Incorrect number of error messages expected 2 got %d' % count
 182
 183     # State FULL -> NONE
 184     log.info('7. Verify no messages settings back to default')
 185
 186     manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs', '--osd_failsafe_full_ratio .97')
 187     time.sleep(10)
 188
 189     proc = mon.run(
 190              args=[
 191                  'sudo',
 192                  'daemon-helper',
 193                  'kill',
 194                  'ceph', '-w'
 195              ],
 196              stdin=run.PIPE,
 197              stdout=StringIO(),
 198              wait=False,
 199         )
 200
 201     time.sleep(sleep_time)
 202     proc.stdin.close() # causes daemon-helper send SIGKILL to ceph -w
 203     proc.wait()
 204
 205     lines = proc.stdout.getvalue().split('\n')
 206
 207     count = len(filter(lambda line: '[WRN] OSD near full' in line, lines))
 208     assert count == 0, 'Incorrect number of warning messages expected 0 got %d' % count
 209     count = len(filter(lambda line: '[ERR] OSD full dropping all updates' in line, lines))
 210     assert count == 0, 'Incorrect number of error messages expected 0 got %d' % count
 211
 212     log.info('Test Passed')
 213
 214     # Bring all OSDs back in
 215     manager.remove_pool("foo")
 216     for osd in osds:
 217         if osd['osd'] != 0:
 218             manager.mark_in_osd(osd['osd'])