1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 #include "librbd/io/ImageRequestWQ.h"
5 #include "common/errno.h"
6 #include "common/zipkin_trace.h"
7 #include "librbd/ExclusiveLock.h"
8 #include "librbd/ImageCtx.h"
9 #include "librbd/ImageState.h"
10 #include "librbd/internal.h"
11 #include "librbd/Utils.h"
12 #include "librbd/exclusive_lock/Policy.h"
13 #include "librbd/io/AioCompletion.h"
14 #include "librbd/io/ImageRequest.h"
16 #define dout_subsys ceph_subsys_rbd
18 #define dout_prefix *_dout << "librbd::io::ImageRequestWQ: " << this \
19 << " " << __func__ << ": "
// Completion context used when a dequeued IO must wait for the exclusive
// lock: once the acquire completes, the stalled request is re-dispatched
// through ImageRequestWQ::handle_acquire_lock().
// NOTE(review): this extract is missing interior lines (closing braces and
// the "template <typename I>" header) -- verify against the full file.
25 struct ImageRequestWQ<I>::C_AcquireLock : public Context {
26 ImageRequestWQ *work_queue;
27 ImageRequest<I> *image_request;
29 C_AcquireLock(ImageRequestWQ *work_queue, ImageRequest<I> *image_request)
30 : work_queue(work_queue), image_request(image_request) {
33 void finish(int r) override {
34 work_queue->handle_acquire_lock(r, image_request);
// Completion context fired after in-flight writes have been flushed while
// writes are being blocked; forwards to handle_blocked_writes() so the
// pending block_writes() contexts can be completed.
// NOTE(review): closing braces for this struct are missing in this extract.
39 struct ImageRequestWQ<I>::C_BlockedWrites : public Context {
40 ImageRequestWQ *work_queue;
41 C_BlockedWrites(ImageRequestWQ *_work_queue)
42 : work_queue(_work_queue) {
45 void finish(int r) override {
46 work_queue->handle_blocked_writes(r);
// Completion context used when a dequeued IO is stalled behind an image
// refresh; once the refresh completes the request is re-dispatched through
// ImageRequestWQ::handle_refreshed().
// NOTE(review): closing braces for this struct are missing in this extract.
51 struct ImageRequestWQ<I>::C_RefreshFinish : public Context {
52 ImageRequestWQ *work_queue;
53 ImageRequest<I> *image_request;
55 C_RefreshFinish(ImageRequestWQ *work_queue,
56 ImageRequest<I> *image_request)
57 : work_queue(work_queue), image_request(image_request) {
59 void finish(int r) override {
60 work_queue->handle_refreshed(r, image_request);
// Constructor: binds this work queue to the image context and registers it
// with the owning thread pool so queued ImageRequests get processed.
// NOTE(review): the closing brace of this constructor is missing from this
// extract (interior lines were dropped).
65 ImageRequestWQ<I>::ImageRequestWQ(I *image_ctx, const string &name,
66 time_t ti, ThreadPool *tp)
67 : ThreadPool::PointerWQ<ImageRequest<I> >(name, ti, 0, tp),
68 m_image_ctx(*image_ctx),
69 m_lock(util::unique_lock_name("ImageRequestWQ<I>::m_lock", this)) {
70 CephContext *cct = m_image_ctx.cct;
71 ldout(cct, 5) << "ictx=" << image_ctx << dendl;
72 this->register_work_queue();
// Synchronous read: wraps aio_read() with a waitable completion.
// NOTE(review): the declaration of `cond` (presumably a C_SaferCond) and the
// final `return cond.wait();` are missing from this extract -- confirm
// against the full file before editing.
76 ssize_t ImageRequestWQ<I>::read(uint64_t off, uint64_t len,
77 ReadResult &&read_result, int op_flags) {
78 CephContext *cct = m_image_ctx.cct;
79 ldout(cct, 20) << "ictx=" << &m_image_ctx << ", off=" << off << ", "
80 << "len = " << len << dendl;
83 AioCompletion *c = AioCompletion::create(&cond);
84 aio_read(c, off, len, std::move(read_result), op_flags, false);
// Synchronous write: clips the IO extent against the image size under
// snap_lock, then wraps aio_write() with a waitable completion.
// NOTE(review): the `if (r < 0)` guard around the lderr line, the `cond`
// declaration, and the final wait/return are missing from this extract.
89 ssize_t ImageRequestWQ<I>::write(uint64_t off, uint64_t len,
90 bufferlist &&bl, int op_flags) {
91 CephContext *cct = m_image_ctx.cct;
92 ldout(cct, 20) << "ictx=" << &m_image_ctx << ", off=" << off << ", "
93 << "len = " << len << dendl;
95 m_image_ctx.snap_lock.get_read();
96 int r = clip_io(util::get_image_ctx(&m_image_ctx), off, &len);
97 m_image_ctx.snap_lock.put_read();
99 lderr(cct) << "invalid IO request: " << cpp_strerror(r) << dendl;
104 AioCompletion *c = AioCompletion::create(&cond);
105 aio_write(c, off, len, std::move(bl), op_flags, false);
// Synchronous discard: clips the extent under snap_lock, then wraps
// aio_discard() with a waitable completion.
// NOTE(review): error-return guard, `cond` declaration and final wait/return
// are missing from this extract.
114 template <typename I>
115 ssize_t ImageRequestWQ<I>::discard(uint64_t off, uint64_t len,
116 bool skip_partial_discard) {
117 CephContext *cct = m_image_ctx.cct;
118 ldout(cct, 20) << "ictx=" << &m_image_ctx << ", off=" << off << ", "
119 << "len = " << len << dendl;
121 m_image_ctx.snap_lock.get_read();
122 int r = clip_io(util::get_image_ctx(&m_image_ctx), off, &len);
123 m_image_ctx.snap_lock.put_read();
125 lderr(cct) << "invalid IO request: " << cpp_strerror(r) << dendl;
130 AioCompletion *c = AioCompletion::create(&cond);
131 aio_discard(c, off, len, skip_partial_discard, false);
// Synchronous write-same: clips the extent under snap_lock, then wraps
// aio_writesame() with a waitable completion.
// NOTE(review): error-return guard, `cond` declaration and final wait/return
// are missing from this extract.
140 template <typename I>
141 ssize_t ImageRequestWQ<I>::writesame(uint64_t off, uint64_t len,
142 bufferlist &&bl, int op_flags) {
143 CephContext *cct = m_image_ctx.cct;
144 ldout(cct, 20) << "ictx=" << &m_image_ctx << ", off=" << off << ", "
145 << "len = " << len << ", data_len " << bl.length() << dendl;
147 m_image_ctx.snap_lock.get_read();
148 int r = clip_io(util::get_image_ctx(&m_image_ctx), off, &len);
149 m_image_ctx.snap_lock.put_read();
151 lderr(cct) << "invalid IO request: " << cpp_strerror(r) << dendl;
156 AioCompletion *c = AioCompletion::create(&cond);
157 aio_writesame(c, off, len, std::move(bl), op_flags, false);
// Synchronous compare-and-write: clips the extent under snap_lock, then
// wraps aio_compare_and_write() with a waitable completion. On mismatch the
// byte offset is reported via *mismatch_off (by the underlying request).
// NOTE(review): parameter lines for cmp_bl/bl/op_flags, the error-return
// guard, `cond` declaration and the final wait/return are missing from this
// extract.
166 template <typename I>
167 ssize_t ImageRequestWQ<I>::compare_and_write(uint64_t off, uint64_t len,
170 uint64_t *mismatch_off,
172 CephContext *cct = m_image_ctx.cct;
173 ldout(cct, 20) << "compare_and_write ictx=" << &m_image_ctx << ", off="
174 << off << ", " << "len = " << len << dendl;
176 m_image_ctx.snap_lock.get_read();
177 int r = clip_io(util::get_image_ctx(&m_image_ctx), off, &len);
178 m_image_ctx.snap_lock.put_read();
180 lderr(cct) << "invalid IO request: " << cpp_strerror(r) << dendl;
185 AioCompletion *c = AioCompletion::create(&cond);
186 aio_compare_and_write(c, off, len, std::move(cmp_bl), std::move(bl),
187 mismatch_off, op_flags, false);
// Asynchronous read entry point. Queues the request onto the work queue when
// non-blocking AIO is enabled, writes are blocked/pending, or a read
// requires the exclusive lock; otherwise issues the read inline and drops
// the in-flight reference immediately.
// NOTE(review): several interior lines (closing braces, the early-return in
// the start_in_flight_io failure path, the trace argument on the queued
// create_read_request call) are missing from this extract.
197 template <typename I>
198 void ImageRequestWQ<I>::aio_read(AioCompletion *c, uint64_t off, uint64_t len,
199 ReadResult &&read_result, int op_flags,
201 CephContext *cct = m_image_ctx.cct;
202 ZTracer::Trace trace;
203 if (m_image_ctx.blkin_trace_all) {
204 trace.init("wq: read", &m_image_ctx.trace_endpoint);
205 trace.event("start");
208 c->init_time(util::get_image_ctx(&m_image_ctx), AIO_TYPE_READ);
209 ldout(cct, 20) << "ictx=" << &m_image_ctx << ", "
210 << "completion=" << c << ", off=" << off << ", "
211 << "len=" << len << ", " << "flags=" << op_flags << dendl;
213 if (native_async && m_image_ctx.event_socket.is_valid()) {
214 c->set_event_notify(true);
217 if (!start_in_flight_io(c)) {
221 // if journaling is enabled -- we need to replay the journal because
222 // it might contain an uncommitted write
223 RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
224 if (m_image_ctx.non_blocking_aio || writes_blocked() || !writes_empty() ||
225 require_lock_on_read()) {
226 queue(ImageRequest<I>::create_read_request(
227 m_image_ctx, c, {{off, len}}, std::move(read_result), op_flags,
231 ImageRequest<I>::aio_read(&m_image_ctx, c, {{off, len}},
232 std::move(read_result), op_flags, trace);
233 finish_in_flight_io();
235 trace.event("finish");
// Asynchronous write entry point. Queues the request when non-blocking AIO
// is enabled or writes are blocked; otherwise issues the write inline under
// owner_lock and drops the in-flight reference.
// NOTE(review): interior lines (trace "start" event, closing braces, the
// early-return on start_in_flight_io failure) are missing from this extract.
238 template <typename I>
239 void ImageRequestWQ<I>::aio_write(AioCompletion *c, uint64_t off, uint64_t len,
240 bufferlist &&bl, int op_flags,
242 CephContext *cct = m_image_ctx.cct;
243 ZTracer::Trace trace;
244 if (m_image_ctx.blkin_trace_all) {
245 trace.init("wq: write", &m_image_ctx.trace_endpoint);
249 c->init_time(util::get_image_ctx(&m_image_ctx), AIO_TYPE_WRITE);
250 ldout(cct, 20) << "ictx=" << &m_image_ctx << ", "
251 << "completion=" << c << ", off=" << off << ", "
252 << "len=" << len << ", flags=" << op_flags << dendl;
254 if (native_async && m_image_ctx.event_socket.is_valid()) {
255 c->set_event_notify(true);
258 if (!start_in_flight_io(c)) {
262 RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
263 if (m_image_ctx.non_blocking_aio || writes_blocked()) {
264 queue(ImageRequest<I>::create_write_request(
265 m_image_ctx, c, {{off, len}}, std::move(bl), op_flags, trace));
268 ImageRequest<I>::aio_write(&m_image_ctx, c, {{off, len}},
269 std::move(bl), op_flags, trace);
270 finish_in_flight_io();
272 trace.event("finish");
// Asynchronous discard entry point. Same queue-vs-inline dispatch pattern as
// aio_write(): queue when non-blocking AIO or writes are blocked, otherwise
// execute inline and release the in-flight reference.
// NOTE(review): interior lines (trace "start" event, closing braces, the
// early-return on start_in_flight_io failure) are missing from this extract.
275 template <typename I>
276 void ImageRequestWQ<I>::aio_discard(AioCompletion *c, uint64_t off,
277 uint64_t len, bool skip_partial_discard,
279 CephContext *cct = m_image_ctx.cct;
280 ZTracer::Trace trace;
281 if (m_image_ctx.blkin_trace_all) {
282 trace.init("wq: discard", &m_image_ctx.trace_endpoint);
286 c->init_time(util::get_image_ctx(&m_image_ctx), AIO_TYPE_DISCARD);
287 ldout(cct, 20) << "ictx=" << &m_image_ctx << ", "
288 << "completion=" << c << ", off=" << off << ", len=" << len
291 if (native_async && m_image_ctx.event_socket.is_valid()) {
292 c->set_event_notify(true);
295 if (!start_in_flight_io(c)) {
299 RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
300 if (m_image_ctx.non_blocking_aio || writes_blocked()) {
301 queue(ImageRequest<I>::create_discard_request(
302 m_image_ctx, c, off, len, skip_partial_discard, trace));
305 ImageRequest<I>::aio_discard(&m_image_ctx, c, off, len,
306 skip_partial_discard, trace);
307 finish_in_flight_io();
309 trace.event("finish");
// Asynchronous flush entry point. A flush is also queued when queued writes
// are pending (!writes_empty()) so it is ordered behind them; otherwise it
// runs inline.
// NOTE(review): interior lines (trace "start" event, closing braces, the
// early-return on start_in_flight_io failure) are missing from this extract.
312 template <typename I>
313 void ImageRequestWQ<I>::aio_flush(AioCompletion *c, bool native_async) {
314 CephContext *cct = m_image_ctx.cct;
315 ZTracer::Trace trace;
316 if (m_image_ctx.blkin_trace_all) {
317 trace.init("wq: flush", &m_image_ctx.trace_endpoint);
321 c->init_time(util::get_image_ctx(&m_image_ctx), AIO_TYPE_FLUSH);
322 ldout(cct, 20) << "ictx=" << &m_image_ctx << ", "
323 << "completion=" << c << dendl;
325 if (native_async && m_image_ctx.event_socket.is_valid()) {
326 c->set_event_notify(true);
329 if (!start_in_flight_io(c)) {
333 RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
334 if (m_image_ctx.non_blocking_aio || writes_blocked() || !writes_empty()) {
335 queue(ImageRequest<I>::create_flush_request(m_image_ctx, c, trace));
337 ImageRequest<I>::aio_flush(&m_image_ctx, c, trace);
338 finish_in_flight_io();
340 trace.event("finish");
// Asynchronous write-same entry point. Same queue-vs-inline dispatch as
// aio_write().
// NOTE(review): interior lines (trace "start" event, closing braces, the
// trailing op_flags/trace arguments on the inline aio_writesame call, the
// early-return on start_in_flight_io failure) are missing from this extract.
343 template <typename I>
344 void ImageRequestWQ<I>::aio_writesame(AioCompletion *c, uint64_t off,
345 uint64_t len, bufferlist &&bl,
346 int op_flags, bool native_async) {
347 CephContext *cct = m_image_ctx.cct;
348 ZTracer::Trace trace;
349 if (m_image_ctx.blkin_trace_all) {
350 trace.init("wq: writesame", &m_image_ctx.trace_endpoint);
354 c->init_time(util::get_image_ctx(&m_image_ctx), AIO_TYPE_WRITESAME);
355 ldout(cct, 20) << "ictx=" << &m_image_ctx << ", "
356 << "completion=" << c << ", off=" << off << ", "
357 << "len=" << len << ", data_len = " << bl.length() << ", "
358 << "flags=" << op_flags << dendl;
360 if (native_async && m_image_ctx.event_socket.is_valid()) {
361 c->set_event_notify(true);
364 if (!start_in_flight_io(c)) {
368 RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
369 if (m_image_ctx.non_blocking_aio || writes_blocked()) {
370 queue(ImageRequest<I>::create_writesame_request(
371 m_image_ctx, c, off, len, std::move(bl), op_flags, trace));
374 ImageRequest<I>::aio_writesame(&m_image_ctx, c, off, len, std::move(bl),
376 finish_in_flight_io();
378 trace.event("finish");
// Asynchronous compare-and-write entry point. Same queue-vs-inline dispatch
// as aio_write(); both the compare buffer and write buffer are moved into
// the request.
// NOTE(review): interior lines (cmp_bl/bl parameter lines, trace "start"
// event, closing braces, the early-return on start_in_flight_io failure) are
// missing from this extract.
381 template <typename I>
382 void ImageRequestWQ<I>::aio_compare_and_write(AioCompletion *c,
383 uint64_t off, uint64_t len,
386 uint64_t *mismatch_off,
387 int op_flags, bool native_async) {
388 CephContext *cct = m_image_ctx.cct;
389 ZTracer::Trace trace;
390 if (m_image_ctx.blkin_trace_all) {
391 trace.init("wq: compare_and_write", &m_image_ctx.trace_endpoint);
395 c->init_time(util::get_image_ctx(&m_image_ctx), AIO_TYPE_COMPARE_AND_WRITE);
396 ldout(cct, 20) << "ictx=" << &m_image_ctx << ", "
397 << "completion=" << c << ", off=" << off << ", "
398 << "len=" << len << dendl;
400 if (native_async && m_image_ctx.event_socket.is_valid()) {
401 c->set_event_notify(true);
404 if (!start_in_flight_io(c)) {
408 RWLock::RLocker owner_locker(m_image_ctx.owner_lock);
409 if (m_image_ctx.non_blocking_aio || writes_blocked()) {
410 queue(ImageRequest<I>::create_compare_and_write_request(
411 m_image_ctx, c, {{off, len}}, std::move(cmp_bl), std::move(bl),
412 mismatch_off, op_flags, trace));
415 ImageRequest<I>::aio_compare_and_write(&m_image_ctx, c, {{off, len}},
416 std::move(cmp_bl), std::move(bl),
417 mismatch_off, op_flags, trace);
418 finish_in_flight_io();
420 trace.event("finish");
// Begins shutdown of the work queue (caller must hold owner_lock). If IOs
// are still in flight, the completion context is deferred until the last one
// finishes (see finish_in_flight_io()); otherwise all in-flight IO is
// flushed immediately.
// NOTE(review): interior lines (e.g. setting m_shutdown, scope braces) are
// missing from this extract.
423 template <typename I>
424 void ImageRequestWQ<I>::shut_down(Context *on_shutdown) {
425 assert(m_image_ctx.owner_lock.is_locked());
428 RWLock::WLocker locker(m_lock);
432 CephContext *cct = m_image_ctx.cct;
433 ldout(cct, 5) << __func__ << ": in_flight=" << m_in_flight_ios.load()
435 if (m_in_flight_ios > 0) {
436 m_on_shutdown = on_shutdown;
441 // ensure that all in-flight IO is flushed
442 m_image_ctx.flush(on_shutdown);
// Synchronous write-block: delegates to the asynchronous overload and waits
// for the block to take effect; returns its completion code.
445 template <typename I>
446 int ImageRequestWQ<I>::block_writes() {
447 C_SaferCond cond_ctx;
448 block_writes(&cond_ctx);
449 return cond_ctx.wait();
// Asynchronously blocks new writes (caller must hold owner_lock). If writes
// are already in flight (or earlier blockers are still pending), the context
// is queued until they drain; otherwise in-flight IO is flushed and the
// context completes with the flush.
// NOTE(review): the m_write_blockers increment and scope braces appear to be
// missing from this extract.
452 template <typename I>
453 void ImageRequestWQ<I>::block_writes(Context *on_blocked) {
454 assert(m_image_ctx.owner_lock.is_locked());
455 CephContext *cct = m_image_ctx.cct;
458 RWLock::WLocker locker(m_lock);
460 ldout(cct, 5) << &m_image_ctx << ", " << "num="
461 << m_write_blockers << dendl;
462 if (!m_write_blocker_contexts.empty() || m_in_flight_writes > 0) {
463 m_write_blocker_contexts.push_back(on_blocked);
468 // ensure that all in-flight IO is flushed
469 m_image_ctx.flush(on_blocked);
// Removes one write blocker; when the count drops to zero the queued writes
// are allowed to proceed again.
// NOTE(review): the decrement of m_write_blockers, the wake_up handling and
// the thread-pool signal at the end are missing from this extract.
472 template <typename I>
473 void ImageRequestWQ<I>::unblock_writes() {
474 CephContext *cct = m_image_ctx.cct;
476 bool wake_up = false;
478 RWLock::WLocker locker(m_lock);
479 assert(m_write_blockers > 0);
482 ldout(cct, 5) << &m_image_ctx << ", " << "num="
483 << m_write_blockers << dendl;
484 if (m_write_blockers == 0) {
// Enables/disables the "exclusive lock required" flag for the given IO
// direction(s). Any state change wakes the thread pool so a stalled queue
// can re-request the lock.
// NOTE(review): the switch statement framing (e.g. "switch (direction) {",
// DIRECTION_READ/DIRECTION_BOTH labels, break statements) and the final
// signal call are missing from this extract.
494 template <typename I>
495 void ImageRequestWQ<I>::set_require_lock(Direction direction, bool enabled) {
496 CephContext *cct = m_image_ctx.cct;
497 ldout(cct, 20) << dendl;
499 bool wake_up = false;
501 RWLock::WLocker locker(m_lock);
504 wake_up = (enabled != m_require_lock_on_read);
505 m_require_lock_on_read = enabled;
507 case DIRECTION_WRITE:
508 wake_up = (enabled != m_require_lock_on_write);
509 m_require_lock_on_write = enabled;
512 wake_up = (enabled != m_require_lock_on_read ||
513 enabled != m_require_lock_on_write);
514 m_require_lock_on_read = enabled;
515 m_require_lock_on_write = enabled;
520 // wake up the thread pool whenever the state changes so that
521 // we can re-request the lock if required
// Thread-pool dequeue hook. Peeks the front request and decides whether it
// can run now: returns nullptr (leaving the item queued) when IO is blocked,
// stalls the item behind an exclusive-lock acquire or image refresh (via
// C_AcquireLock / C_RefreshFinish), or hands it to process(). The pool lock
// is dropped around calls that may re-enter the work queue.
// NOTE(review): this function is heavily truncated in this extract -- the
// m_io_blockers increments, several return statements, else branches and
// closing braces are missing. Do not modify without the full file.
527 template <typename I>
528 void *ImageRequestWQ<I>::_void_dequeue() {
529 CephContext *cct = m_image_ctx.cct;
530 ImageRequest<I> *peek_item = this->front();
532 // no queued IO requests or all IO is blocked/stalled
533 if (peek_item == nullptr || m_io_blockers.load() > 0) {
538 bool refresh_required = m_image_ctx.state->is_refresh_required();
540 RWLock::RLocker locker(m_lock);
541 bool write_op = peek_item->is_write_op();
542 lock_required = is_lock_required(write_op);
544 if (!lock_required && m_write_blockers > 0) {
545 // missing lock is not the write blocker
549 if (!lock_required && !refresh_required) {
550 // completed ops will requeue the IO -- don't count it as in-progress
551 m_in_flight_writes++;
556 ImageRequest<I> *item = reinterpret_cast<ImageRequest<I> *>(
557 ThreadPool::PointerWQ<ImageRequest<I> >::_void_dequeue());
558 assert(peek_item == item);
561 this->get_pool_lock().Unlock();
562 m_image_ctx.owner_lock.get_read();
563 if (m_image_ctx.exclusive_lock != nullptr) {
564 ldout(cct, 5) << "exclusive lock required: delaying IO " << item << dendl;
565 if (!m_image_ctx.get_exclusive_lock_policy()->may_auto_request_lock()) {
566 lderr(cct) << "op requires exclusive lock" << dendl;
567 fail_in_flight_io(-EROFS, item);
569 // wake up the IO since we won't be returning a request to process
572 // stall IO until the acquire completes
574 m_image_ctx.exclusive_lock->acquire_lock(new C_AcquireLock(this, item));
577 // raced with the exclusive lock being disabled
578 lock_required = false;
580 m_image_ctx.owner_lock.put_read();
581 this->get_pool_lock().Lock();
588 if (refresh_required) {
589 ldout(cct, 5) << "image refresh required: delaying IO " << item << dendl;
591 // stall IO until the refresh completes
594 this->get_pool_lock().Unlock();
595 m_image_ctx.state->refresh(new C_RefreshFinish(this, item));
596 this->get_pool_lock().Lock();
// Worker-thread execution of a dequeued request: runs it, updates the
// queued/in-flight write accounting, then releases the in-flight IO
// reference.
// NOTE(review): the actual req->send()/delete and scope braces are missing
// from this extract.
604 template <typename I>
605 void ImageRequestWQ<I>::process(ImageRequest<I> *req) {
606 CephContext *cct = m_image_ctx.cct;
607 ldout(cct, 20) << "ictx=" << &m_image_ctx << ", "
608 << "req=" << req << dendl;
612 finish_queued_io(req);
613 if (req->is_write_op()) {
614 finish_in_flight_write();
618 finish_in_flight_io();
// Decrements the queued-write or queued-read counter for a request that has
// left the queue.
// NOTE(review): the actual decrements and else branch framing are missing
// from this extract -- only the asserts remain visible.
621 template <typename I>
622 void ImageRequestWQ<I>::finish_queued_io(ImageRequest<I> *req) {
623 RWLock::RLocker locker(m_lock);
624 if (req->is_write_op()) {
625 assert(m_queued_writes > 0);
628 assert(m_queued_reads > 0);
// Decrements the in-flight write count; when the last in-flight write
// completes while block_writes() contexts are pending, flushes the image and
// completes them via C_BlockedWrites.
633 template <typename I>
634 void ImageRequestWQ<I>::finish_in_flight_write() {
635 bool writes_blocked = false;
637 RWLock::RLocker locker(m_lock);
638 assert(m_in_flight_writes > 0);
639 if (--m_in_flight_writes == 0 &&
640 !m_write_blocker_contexts.empty()) {
641 writes_blocked = true;
645 if (writes_blocked) {
646 m_image_ctx.flush(new C_BlockedWrites(this));
// Registers a new in-flight IO; rejects the completion when the queue has
// already been shut down.
// NOTE(review): the m_shutdown check, the completion-fail call, the
// m_in_flight_ios increment and the return statements are missing from this
// extract.
650 template <typename I>
651 int ImageRequestWQ<I>::start_in_flight_io(AioCompletion *c) {
652 RWLock::RLocker locker(m_lock);
655 CephContext *cct = m_image_ctx.cct;
656 lderr(cct) << "IO received on closed image" << dendl;
// Releases one in-flight IO reference; when the last IO completes after
// shutdown has been requested, flushes the image and fires the deferred
// shutdown context captured by shut_down().
667 template <typename I>
668 void ImageRequestWQ<I>::finish_in_flight_io() {
669 Context *on_shutdown;
671 RWLock::RLocker locker(m_lock);
672 if (--m_in_flight_ios > 0 || !m_shutdown) {
675 on_shutdown = m_on_shutdown;
678 CephContext *cct = m_image_ctx.cct;
679 ldout(cct, 5) << "completing shut down" << dendl;
681 assert(on_shutdown != nullptr);
682 m_image_ctx.flush(on_shutdown);
// Fails a request that was pulled from the queue but cannot run: unwinds the
// work-queue bookkeeping and releases the in-flight reference.
// NOTE(review): the line that actually fails the request's completion with
// `r` appears to be missing from this extract.
685 template <typename I>
686 void ImageRequestWQ<I>::fail_in_flight_io(int r, ImageRequest<I> *req) {
687 this->process_finish();
689 finish_queued_io(req);
691 finish_in_flight_io();
// Returns true when the given op direction currently requires the exclusive
// lock (caller must hold m_lock).
694 template <typename I>
695 bool ImageRequestWQ<I>::is_lock_required(bool write_op) const {
696 assert(m_lock.is_locked());
697 return ((write_op && m_require_lock_on_write) ||
698 (!write_op && m_require_lock_on_read));
// Enqueues a request onto the thread-pool work queue (caller must hold
// owner_lock), updating the queued read/write counters first.
// NOTE(review): the counter increments inside the if/else are missing from
// this extract.
701 template <typename I>
702 void ImageRequestWQ<I>::queue(ImageRequest<I> *req) {
703 assert(m_image_ctx.owner_lock.is_locked());
705 CephContext *cct = m_image_ctx.cct;
706 ldout(cct, 20) << "ictx=" << &m_image_ctx << ", "
707 << "req=" << req << dendl;
709 if (req->is_write_op()) {
715 ThreadPool::PointerWQ<ImageRequest<I> >::queue(req);
// Callback from C_AcquireLock: on failure the stalled request is failed; on
// success it is requeued (IO order is preserved because the queue was
// stalled during the acquire) and the IO blocker is released.
// NOTE(review): the success-path requeue, the m_io_blockers decrement and
// the pool signal are missing from this extract.
718 template <typename I>
719 void ImageRequestWQ<I>::handle_acquire_lock(int r, ImageRequest<I> *req) {
720 CephContext *cct = m_image_ctx.cct;
721 ldout(cct, 5) << "r=" << r << ", " << "req=" << req << dendl;
724 fail_in_flight_io(r, req);
726 // since IO was stalled for acquire -- original IO order is preserved
727 // if we requeue this op for work queue processing
731 assert(m_io_blockers.load() > 0);
// Callback from C_RefreshFinish: mirrors handle_acquire_lock() -- fail the
// request on refresh error, otherwise requeue it in order and release the
// IO blocker.
// NOTE(review): the success-path requeue, the m_io_blockers decrement and
// the pool signal are missing from this extract.
736 template <typename I>
737 void ImageRequestWQ<I>::handle_refreshed(int r, ImageRequest<I> *req) {
738 CephContext *cct = m_image_ctx.cct;
739 ldout(cct, 5) << "resuming IO after image refresh: r=" << r << ", "
740 << "req=" << req << dendl;
742 fail_in_flight_io(r, req);
744 // since IO was stalled for refresh -- original IO order is preserved
745 // if we requeue this op for work queue processing
749 assert(m_io_blockers.load() > 0);
// Callback from C_BlockedWrites after the post-block flush: completes all
// contexts queued by block_writes() now that in-flight writes have drained.
// NOTE(review): the local Contexts declaration and the ctx->complete(r) call
// inside the loop are missing from this extract.
754 template <typename I>
755 void ImageRequestWQ<I>::handle_blocked_writes(int r) {
758 RWLock::WLocker locker(m_lock);
759 contexts.swap(m_write_blocker_contexts);
762 for (auto ctx : contexts) {
767 template class librbd::io::ImageRequestWQ<librbd::ImageCtx>;
770 } // namespace librbd