4 # Test the lost object logic
# Load test_common.sh from the directory this script lives in.
# "$0" is quoted inside $(...) so a script path containing whitespace
# does not get word-split before dirname sees it.
8 source "$(dirname "$0")/test_common.sh"
14 export CEPH_NUM_OSD=$1
20 # set recovery start to a really long time to ensure that we don't start recovery
21 ./vstart.sh -d -n -o "$vstart_config" || die "vstart failed"
23 # For existing pools, set size to no greater than the number of OSDs,
24 # so recovery from degraded pgs is possible
26 for pool in `./ceph osd pool ls`; do
27 local size=`./ceph osd pool get ${pool} size | awk '{print $2}'`
28 if [ "${size}" -gt "${CEPH_NUM_OSD}" ]; then
29 ./ceph osd pool set ${pool} size ${CEPH_NUM_OSD}
33 if [ ${changed} -eq 1 ]; then
34 # XXX: When a pool has degraded pgs due to size greater than number
35 # of OSDs, after decreasing the size the recovery could still get stuck
36 # and require an additional kick.
41 poll_cmd "./ceph health" HEALTH_OK 1 30
45 # Write lots and lots of objects
46 write_objects 1 1 200 4000 $TEST_POOL
51 # Continue writing a lot of objects
52 write_objects 2 2 200 4000 $TEST_POOL
61 # At this point we have peered, but *NOT* recovered.
62 # Objects should be lost.
# Wait for the cluster to report degraded PGs, then unfound objects;
# abort the test through die if either condition is not observed.
# NOTE(review): the checks below treat a poll_cmd exit status of 1 as
# "expected output was seen" -- confirm against poll_cmd in test_common.sh.
65 poll_cmd "./ceph pg debug degraded_pgs_exist" TRUE 3 120
66 [ $? -eq 1 ] || die "Failed to see degraded PGs."
67 poll_cmd "./ceph pg debug unfound_objects_exist" TRUE 3 120
68 [ $? -eq 1 ] || die "Failed to see unfound objects."
69 echo "Got unfound objects."
75 # Turn on recovery and wait for it to complete.
76 poll_cmd "./ceph pg debug unfound_objects_exist" FALSE 3 120
77 [ $? -eq 1 ] || die "Failed to recover unfound objects."
78 poll_cmd "./ceph pg debug degraded_pgs_exist" FALSE 3 120
79 [ $? -eq 1 ] || die "Recovery never finished."
83 setup 2 'osd recovery delay start = 10000'
89 local lost_action=delete
92 if is_set revert_lost $flags; then
96 # Write lots and lots of objects
97 write_objects 1 1 20 8000 $TEST_POOL
102 # Continue writing a lot of objects
103 write_objects 2 2 20 8000 $TEST_POOL
112 # At this point we have peered, but *NOT* recovered.
113 # Objects should be lost.
116 # Since recovery can't proceed, stuff should be unfound.
117 poll_cmd "./ceph pg debug unfound_objects_exist" TRUE 3 120
118 [ $? -eq 1 ] || die "Failed to see unfound objects."
# Collect the ids (second field) of the "pg ..." lines in
# "ceph health detail" that end in "<N> unfound".
# The awk test must be "==": with a single "=" the expression is an
# assignment that overwrites field 1 instead of comparing it, so any line
# ending in "<N> unfound" was accepted regardless of its first word.
120 pgs_unfound=`./ceph health detail |awk '$1 == "pg" && /[0-9] unfound$/ {print $2}'`
# Without at least one such pg the rest of the test has nothing to act on.
122 [ -n "$pgs_unfound" ] || die "no pg with unfound objects"
124 for pg in $pgs_unfound; do
125 ./ceph pg $pg mark_unfound_lost revert &&
126 die "mark_unfound_lost unexpectedly succeeded for pg $pg"
129 if ! is_set mark_osd_lost $flags && ! is_set rm_osd $flags; then
133 if is_set try_to_fetch_unfound $flags; then
134 # Ask for an object while it's still unfound, and
135 # verify we get woken to an error when it's declared lost.
136 echo "trying to get one of the unfound objects"
138 ./rados -c ./ceph.conf -p $TEST_POOL get obj02 $TEMPDIR/obj02 &&\
139 die "expected radostool error"
143 if is_set mark_osd_lost $flags; then
144 ./ceph osd lost 0 --yes-i-really-mean-it
147 if is_set rm_osd $flags; then
151 if ! is_set auto_mark_unfound_lost $flags; then
152 for pg in $pgs_unfound; do
153 ./ceph pg $pg mark_unfound_lost ${lost_action} ||
154 die "mark_unfound_lost failed for pg $pg"
160 # Unfound objects go away and are turned into lost objects.
161 poll_cmd "./ceph pg debug unfound_objects_exist" FALSE 3 120
162 [ $? -eq 1 ] || die "Unfound objects didn't go away."
# Ask every pg to revert unfound objects once more; each one is expected to
# answer 'pg has no unfound objects', otherwise the test is aborted.
# Uses the in-tree ./ceph like every other invocation in this script,
# rather than whichever "ceph" happens to be first on PATH.
164 for pg in `./ceph pg ls | awk '/^[0-9]/ {print $1}'`; do
165 ./ceph pg $pg mark_unfound_lost revert 2>&1 |
166 grep 'pg has no unfound objects' ||
167 die "pg $pg has unfound objects"
170 # Reading from a lost object gives back an error code.
171 # TODO: check error code
# Try to read obj01 and remember the exit status immediately: every "["
# test below overwrites $?, so without saving it the elif branch would
# examine the status of the first test instead of the rados command.
172 ./rados -c ./ceph.conf -p $TEST_POOL get obj01 $TEMPDIR/obj01
rados_rc=$?
# Compare the *value* of $lost_action (the bare word "lost_action" never
# equals "delete" or "revert", which made both checks dead code):
#   delete -> the read has to fail
#   revert -> the read has to succeed
173 if [ "$lost_action" = delete ] && [ "$rados_rc" -eq 0 ]; then
174 die "expected radostool error"
175 elif [ "$lost_action" = revert ] && [ "$rados_rc" -ne 0 ]; then
176 die "unexpected radostool error"
179 if is_set try_to_fetch_unfound $flags; then
180 echo "waiting for the try_to_fetch_unfound \
181 radostool instance to finish"
187 setup 2 'osd recovery delay start = 10000'
188 lost1_impl mark_osd_lost revert_lost
192 setup 2 'osd recovery delay start = 10000'
193 lost1_impl mark_osd_lost try_to_fetch_unfound
197 setup 2 'osd recovery delay start = 10000'
202 setup 2 'osd recovery delay start = 10000'
203 lost1_impl mark_osd_lost rm_osd
207 setup 2 'osd recovery delay start = 10000'
208 lost1_impl mark_osd_lost auto_mark_unfound_lost
211 all_osds_die_impl() {
212 poll_cmd "./ceph osd stat" '3 up, 3 in' 20 240
213 [ $? -eq 1 ] || die "didn't start 3 osds"
219 # wait for the MOSDPGStat timeout
220 poll_cmd "./ceph osd stat" '0 up' 20 240
221 [ $? -eq 1 ] || die "all osds weren't marked as down"
225 setup 3 'osd mon report interval max = 60
226 osd mon report interval min = 3
227 mon osd report timeout = 60'
233 recovery1 || die "test failed"
235 lost1 || die "test failed"
237 # XXX: try_to_fetch_unfound test currently hangs on "waiting for the
238 # try_to_fetch_unfound radostool instance to finish"
239 #lost2 || die "test failed"
241 lost3 || die "test failed"
243 lost4 || die "test failed"
245 # XXX: automatically marking lost is not implemented
246 #lost5 || die "test failed"
248 all_osds_die || die "test failed"