Add the RT Linux kernel 4.1.3-rt3 as base
diff --git a/kernel/drivers/block/drbd/drbd_worker.c b/kernel/drivers/block/drbd/drbd_worker.c
new file mode 100644
index 0000000..d0fae55
--- /dev/null
+++ b/kernel/drivers/block/drbd/drbd_worker.c
@@ -0,0 +1,2156 @@
+/*
+   drbd_worker.c
+
+   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+   Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
+   Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+   Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+   drbd is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   drbd is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with drbd; see the file COPYING.  If not, write to
+   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+
+*/
+
+#include <linux/module.h>
+#include <linux/drbd.h>
+#include <linux/sched.h>
+#include <linux/wait.h>
+#include <linux/mm.h>
+#include <linux/memcontrol.h>
+#include <linux/mm_inline.h>
+#include <linux/slab.h>
+#include <linux/random.h>
+#include <linux/string.h>
+#include <linux/scatterlist.h>
+
+#include "drbd_int.h"
+#include "drbd_protocol.h"
+#include "drbd_req.h"
+
+static int make_ov_request(struct drbd_device *, int);
+static int make_resync_request(struct drbd_device *, int);
+
+/* endio handlers:
+ *   drbd_md_endio (defined here)
+ *   drbd_request_endio (defined here)
+ *   drbd_peer_request_endio (defined here)
+ *   drbd_bm_endio (defined in drbd_bitmap.c)
+ *
+ * For all these callbacks, note the following:
+ * The callbacks will be called in irq context by the IDE drivers,
+ * and in Softirqs/Tasklets/BH context by the SCSI drivers.
+ * Try to get the locking right :)
+ *
+ */
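+/* These handlers may run in hard or soft irq context, which is why the
+ * completion paths below take device->resource->req_lock with
+ * spin_lock_irqsave() rather than a plain spin_lock(). */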
+
+
+/* About the global_state_lock
+   Each state transition on a device holds a read lock. In case we have
+   to evaluate the resync-after dependencies, we grab a write lock, because
+   we need stable states on all devices for that.  */
+rwlock_t global_state_lock;
+
+/* used for synchronous meta data and bitmap IO
+ * submitted by drbd_md_sync_page_io()
+ */
+void drbd_md_endio(struct bio *bio, int error)
+{
+       struct drbd_device *device;
+
+       device = bio->bi_private;
+       device->md_io.error = error;
+
+       /* We grabbed an extra reference in _drbd_md_sync_page_io() to be able
+        * to timeout on the lower level device, and eventually detach from it.
+        * If this io completion runs after that timeout expired, this
+        * drbd_md_put_buffer() may allow us to finally try and re-attach.
+        * During normal operation, this only puts that extra reference
+        * down to 1 again.
+        * Make sure we first drop the reference, and only then signal
+        * completion, or we may (in drbd_al_read_log()) cycle so fast into the
+        * next drbd_md_sync_page_io() that we trigger the
+        * ASSERT(atomic_read(&device->md_io_in_use) == 1) there.
+        */
+       drbd_md_put_buffer(device);
+       device->md_io.done = 1;
+       wake_up(&device->misc_wait);
+       bio_put(bio);
+       if (device->ldev) /* special case: drbd_md_read() during drbd_adm_attach() */
+               put_ldev(device);
+}
+
+/* reads on behalf of the partner,
+ * "submitted" by the receiver
+ */
+static void drbd_endio_read_sec_final(struct drbd_peer_request *peer_req) __releases(local)
+{
+       unsigned long flags = 0;
+       struct drbd_peer_device *peer_device = peer_req->peer_device;
+       struct drbd_device *device = peer_device->device;
+
+       spin_lock_irqsave(&device->resource->req_lock, flags);
+       device->read_cnt += peer_req->i.size >> 9;
+       list_del(&peer_req->w.list);
+       if (list_empty(&device->read_ee))
+               wake_up(&device->ee_wait);
+       if (test_bit(__EE_WAS_ERROR, &peer_req->flags))
+               __drbd_chk_io_error(device, DRBD_READ_ERROR);
+       spin_unlock_irqrestore(&device->resource->req_lock, flags);
+
+       drbd_queue_work(&peer_device->connection->sender_work, &peer_req->w);
+       put_ldev(device);
+}
+
+/* writes on behalf of the partner, or resync writes,
+ * "submitted" by the receiver, final stage.  */
+void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __releases(local)
+{
+       unsigned long flags = 0;
+       struct drbd_peer_device *peer_device = peer_req->peer_device;
+       struct drbd_device *device = peer_device->device;
+       struct drbd_interval i;
+       int do_wake;
+       u64 block_id;
+       int do_al_complete_io;
+
+       /* after we moved peer_req to done_ee,
+        * we may no longer access it,
+        * it may be freed/reused already!
+        * (as soon as we release the req_lock) */
+       i = peer_req->i;
+       do_al_complete_io = peer_req->flags & EE_CALL_AL_COMPLETE_IO;
+       block_id = peer_req->block_id;
+       peer_req->flags &= ~EE_CALL_AL_COMPLETE_IO;
+
+       spin_lock_irqsave(&device->resource->req_lock, flags);
+       device->writ_cnt += peer_req->i.size >> 9;
+       list_move_tail(&peer_req->w.list, &device->done_ee);
+
+       /*
+        * Do not remove from the write_requests tree here: we did not send the
+        * Ack yet and did not wake possibly waiting conflicting requests.
+        * Removal from the tree happens in "drbd_process_done_ee" within the
+        * appropriate dw.cb (e_end_block/e_end_resync_block) or in
+        * _drbd_clear_done_ee.
+        */
+
+       do_wake = list_empty(block_id == ID_SYNCER ? &device->sync_ee : &device->active_ee);
+
+       /* FIXME do we want to detach for failed REQ_DISCARD?
+        * ((peer_req->flags & (EE_WAS_ERROR|EE_IS_TRIM)) == EE_WAS_ERROR) */
+       if (peer_req->flags & EE_WAS_ERROR)
+               __drbd_chk_io_error(device, DRBD_WRITE_ERROR);
+       spin_unlock_irqrestore(&device->resource->req_lock, flags);
+
+       if (block_id == ID_SYNCER)
+               drbd_rs_complete_io(device, i.sector);
+
+       if (do_wake)
+               wake_up(&device->ee_wait);
+
+       if (do_al_complete_io)
+               drbd_al_complete_io(device, &i);
+
+       wake_asender(peer_device->connection);
+       put_ldev(device);
+}
+
+/* writes on behalf of the partner, or resync writes,
+ * "submitted" by the receiver.
+ */
+void drbd_peer_request_endio(struct bio *bio, int error)
+{
+       struct drbd_peer_request *peer_req = bio->bi_private;
+       struct drbd_device *device = peer_req->peer_device->device;
+       int uptodate = bio_flagged(bio, BIO_UPTODATE);
+       int is_write = bio_data_dir(bio) == WRITE;
+       int is_discard = !!(bio->bi_rw & REQ_DISCARD);
+
+       if (error && __ratelimit(&drbd_ratelimit_state))
+               drbd_warn(device, "%s: error=%d s=%llus\n",
+                               is_write ? (is_discard ? "discard" : "write")
+                                       : "read", error,
+                               (unsigned long long)peer_req->i.sector);
+       if (!error && !uptodate) {
+               if (__ratelimit(&drbd_ratelimit_state))
+                       drbd_warn(device, "%s: setting error to -EIO s=%llus\n",
+                                       is_write ? "write" : "read",
+                                       (unsigned long long)peer_req->i.sector);
+               /* strange behavior of some lower level drivers...
+                * fail the request by clearing the uptodate flag,
+                * but do not return any error?! */
+               error = -EIO;
+       }
+
+       if (error)
+               set_bit(__EE_WAS_ERROR, &peer_req->flags);
+
+       bio_put(bio); /* no need for the bio anymore */
+       if (atomic_dec_and_test(&peer_req->pending_bios)) {
+               if (is_write)
+                       drbd_endio_write_sec_final(peer_req);
+               else
+                       drbd_endio_read_sec_final(peer_req);
+       }
+}
+
+/* read, readA or write requests on R_PRIMARY coming from drbd_make_request
+ */
+void drbd_request_endio(struct bio *bio, int error)
+{
+       unsigned long flags;
+       struct drbd_request *req = bio->bi_private;
+       struct drbd_device *device = req->device;
+       struct bio_and_error m;
+       enum drbd_req_event what;
+       int uptodate = bio_flagged(bio, BIO_UPTODATE);
+
+       if (!error && !uptodate) {
+               drbd_warn(device, "p %s: setting error to -EIO\n",
+                        bio_data_dir(bio) == WRITE ? "write" : "read");
+               /* strange behavior of some lower level drivers...
+                * fail the request by clearing the uptodate flag,
+                * but do not return any error?! */
+               error = -EIO;
+       }
+
+
+       /* If this request was aborted locally before,
+        * but now was completed "successfully",
+        * chances are that this caused arbitrary data corruption.
+        *
+        * "aborting" requests, or force-detaching the disk, is intended for
+        * completely blocked/hung local backing devices which no longer
+        * complete requests at all, not even error completions.  In this
+        * situation, usually a hard-reset and failover is the only way out.
+        *
+        * By "aborting", basically faking a local error-completion,
+        * we allow for a more graceful switchover by cleanly migrating services.
+        * Still the affected node has to be rebooted "soon".
+        *
+        * By completing these requests, we allow the upper layers to re-use
+        * the associated data pages.
+        *
+        * If later the local backing device "recovers", and now DMAs some data
+        * from disk into the original request pages, in the best case it will
+        * just put random data into unused pages; but typically it will corrupt
+        * meanwhile completely unrelated data, causing all sorts of damage.
+        *
+        * Which means delayed successful completion,
+        * especially for READ requests,
+        * is a reason to panic().
+        *
+        * We assume that a delayed *error* completion is OK,
+        * though we still will complain noisily about it.
+        */
+       if (unlikely(req->rq_state & RQ_LOCAL_ABORTED)) {
+               if (__ratelimit(&drbd_ratelimit_state))
+                       drbd_emerg(device, "delayed completion of aborted local request; disk-timeout may be too aggressive\n");
+
+               if (!error)
+                       panic("possible random memory corruption caused by delayed completion of aborted local request\n");
+       }
+
+       /* to avoid recursion in __req_mod */
+       if (unlikely(error)) {
+               if (bio->bi_rw & REQ_DISCARD)
+                       what = (error == -EOPNOTSUPP)
+                               ? DISCARD_COMPLETED_NOTSUPP
+                               : DISCARD_COMPLETED_WITH_ERROR;
+               else
+                       what = (bio_data_dir(bio) == WRITE)
+                       ? WRITE_COMPLETED_WITH_ERROR
+                       : (bio_rw(bio) == READ)
+                         ? READ_COMPLETED_WITH_ERROR
+                         : READ_AHEAD_COMPLETED_WITH_ERROR;
+       } else
+               what = COMPLETED_OK;
+
+       bio_put(req->private_bio);
+       req->private_bio = ERR_PTR(error);
+
+       /* not req_mod(), we need irqsave here! */
+       spin_lock_irqsave(&device->resource->req_lock, flags);
+       __req_mod(req, what, &m);
+       spin_unlock_irqrestore(&device->resource->req_lock, flags);
+       put_ldev(device);
+
+       if (m.bio)
+               complete_master_bio(device, &m);
+}
+
+void drbd_csum_ee(struct crypto_hash *tfm, struct drbd_peer_request *peer_req, void *digest)
+{
+       struct hash_desc desc;
+       struct scatterlist sg;
+       struct page *page = peer_req->pages;
+       struct page *tmp;
+       unsigned len;
+
+       desc.tfm = tfm;
+       desc.flags = 0;
+
+       sg_init_table(&sg, 1);
+       crypto_hash_init(&desc);
+
+       while ((tmp = page_chain_next(page))) {
+               /* all but the last page will be fully used */
+               sg_set_page(&sg, page, PAGE_SIZE, 0);
+               crypto_hash_update(&desc, &sg, sg.length);
+               page = tmp;
+       }
+       /* and now the last, possibly only partially used page */
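+       /* e.g. with 4k pages, i.size == 5120 gives len == 1024, so only the
+        * used part is hashed; if i.size is a multiple of PAGE_SIZE, len == 0
+        * and the "?:" falls back to a full PAGE_SIZE. */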
+       len = peer_req->i.size & (PAGE_SIZE - 1);
+       sg_set_page(&sg, page, len ?: PAGE_SIZE, 0);
+       crypto_hash_update(&desc, &sg, sg.length);
+       crypto_hash_final(&desc, digest);
+}
+
+void drbd_csum_bio(struct crypto_hash *tfm, struct bio *bio, void *digest)
+{
+       struct hash_desc desc;
+       struct scatterlist sg;
+       struct bio_vec bvec;
+       struct bvec_iter iter;
+
+       desc.tfm = tfm;
+       desc.flags = 0;
+
+       sg_init_table(&sg, 1);
+       crypto_hash_init(&desc);
+
+       bio_for_each_segment(bvec, bio, iter) {
+               sg_set_page(&sg, bvec.bv_page, bvec.bv_len, bvec.bv_offset);
+               crypto_hash_update(&desc, &sg, sg.length);
+       }
+       crypto_hash_final(&desc, digest);
+}
+
+/* MAYBE merge common code with w_e_end_ov_req */
+static int w_e_send_csum(struct drbd_work *w, int cancel)
+{
+       struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
+       struct drbd_peer_device *peer_device = peer_req->peer_device;
+       struct drbd_device *device = peer_device->device;
+       int digest_size;
+       void *digest;
+       int err = 0;
+
+       if (unlikely(cancel))
+               goto out;
+
+       if (unlikely((peer_req->flags & EE_WAS_ERROR) != 0))
+               goto out;
+
+       digest_size = crypto_hash_digestsize(peer_device->connection->csums_tfm);
+       digest = kmalloc(digest_size, GFP_NOIO);
+       if (digest) {
+               sector_t sector = peer_req->i.sector;
+               unsigned int size = peer_req->i.size;
+               drbd_csum_ee(peer_device->connection->csums_tfm, peer_req, digest);
+               /* Free peer_req and pages before send.
+                * In case we block on congestion, we could otherwise run into
+                * some distributed deadlock, if the other side blocks on
+                * congestion as well, because our receiver blocks in
+                * drbd_alloc_pages due to pp_in_use > max_buffers. */
+               drbd_free_peer_req(device, peer_req);
+               peer_req = NULL;
+               inc_rs_pending(device);
+               err = drbd_send_drequest_csum(peer_device, sector, size,
+                                             digest, digest_size,
+                                             P_CSUM_RS_REQUEST);
+               kfree(digest);
+       } else {
+               drbd_err(device, "kmalloc() of digest failed.\n");
+               err = -ENOMEM;
+       }
+
+out:
+       if (peer_req)
+               drbd_free_peer_req(device, peer_req);
+
+       if (unlikely(err))
+               drbd_err(device, "drbd_send_drequest(..., csum) failed\n");
+       return err;
+}
+
+#define GFP_TRY        (__GFP_HIGHMEM | __GFP_NOWARN)
+
+static int read_for_csum(struct drbd_peer_device *peer_device, sector_t sector, int size)
+{
+       struct drbd_device *device = peer_device->device;
+       struct drbd_peer_request *peer_req;
+
+       if (!get_ldev(device))
+               return -EIO;
+
+       /* GFP_TRY, because if there is no memory available right now, this may
+        * be rescheduled for later. It is "only" background resync, after all. */
+       peer_req = drbd_alloc_peer_req(peer_device, ID_SYNCER /* unused */, sector,
+                                      size, true /* has real payload */, GFP_TRY);
+       if (!peer_req)
+               goto defer;
+
+       peer_req->w.cb = w_e_send_csum;
+       spin_lock_irq(&device->resource->req_lock);
+       list_add_tail(&peer_req->w.list, &device->read_ee);
+       spin_unlock_irq(&device->resource->req_lock);
+
+       atomic_add(size >> 9, &device->rs_sect_ev);
+       if (drbd_submit_peer_request(device, peer_req, READ, DRBD_FAULT_RS_RD) == 0)
+               return 0;
+
+       /* If it failed because of ENOMEM, retry should help.  If it failed
+        * because bio_add_page failed (probably broken lower level driver),
+        * retry may or may not help.
+        * If it does not, you may need to force disconnect. */
+       spin_lock_irq(&device->resource->req_lock);
+       list_del(&peer_req->w.list);
+       spin_unlock_irq(&device->resource->req_lock);
+
+       drbd_free_peer_req(device, peer_req);
+defer:
+       put_ldev(device);
+       return -EAGAIN;
+}
+
+int w_resync_timer(struct drbd_work *w, int cancel)
+{
+       struct drbd_device *device =
+               container_of(w, struct drbd_device, resync_work);
+
+       switch (device->state.conn) {
+       case C_VERIFY_S:
+               make_ov_request(device, cancel);
+               break;
+       case C_SYNC_TARGET:
+               make_resync_request(device, cancel);
+               break;
+       }
+
+       return 0;
+}
+
+void resync_timer_fn(unsigned long data)
+{
+       struct drbd_device *device = (struct drbd_device *) data;
+
+       drbd_queue_work_if_unqueued(
+               &first_peer_device(device)->connection->sender_work,
+               &device->resync_work);
+}
+
+static void fifo_set(struct fifo_buffer *fb, int value)
+{
+       int i;
+
+       for (i = 0; i < fb->size; i++)
+               fb->values[i] = value;
+}
+
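+/* Push "value" into a fixed-length ring and return the displaced oldest
+ * entry.  Example: size == 3, values == [2, 4, 6], head_index == 0:
+ * fifo_push(fb, 8) returns 2 and leaves [8, 4, 6] with head_index == 1 --
+ * a delay line, as used by the resync controller below. */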
+static int fifo_push(struct fifo_buffer *fb, int value)
+{
+       int ov;
+
+       ov = fb->values[fb->head_index];
+       fb->values[fb->head_index++] = value;
+
+       if (fb->head_index >= fb->size)
+               fb->head_index = 0;
+
+       return ov;
+}
+
+static void fifo_add_val(struct fifo_buffer *fb, int value)
+{
+       int i;
+
+       for (i = 0; i < fb->size; i++)
+               fb->values[i] += value;
+}
+
+struct fifo_buffer *fifo_alloc(int fifo_size)
+{
+       struct fifo_buffer *fb;
+
+       fb = kzalloc(sizeof(struct fifo_buffer) + sizeof(int) * fifo_size, GFP_NOIO);
+       if (!fb)
+               return NULL;
+
+       fb->head_index = 0;
+       fb->size = fifo_size;
+       fb->total = 0;
+
+       return fb;
+}
+
+static int drbd_rs_controller(struct drbd_device *device, unsigned int sect_in)
+{
+       struct disk_conf *dc;
+       unsigned int want;     /* The number of sectors we want in-flight */
+       int req_sect; /* Number of sectors to request in this turn */
+       int correction; /* Number of sectors more we need in-flight */
+       int cps; /* correction per invocation of drbd_rs_controller() */
+       int steps; /* Number of time steps to plan ahead */
+       int curr_corr;
+       int max_sect;
+       struct fifo_buffer *plan;
+
+       dc = rcu_dereference(device->ldev->disk_conf);
+       plan = rcu_dereference(device->rs_plan_s);
+
+       steps = plan->size; /* (dc->c_plan_ahead * 10 * SLEEP_TIME) / HZ; */
+
+       if (device->rs_in_flight + sect_in == 0) { /* At start of resync */
+               want = ((dc->resync_rate * 2 * SLEEP_TIME) / HZ) * steps;
+       } else { /* normal path */
+               want = dc->c_fill_target ? dc->c_fill_target :
+                       sect_in * dc->c_delay_target * HZ / (SLEEP_TIME * 10);
+       }
+
+       correction = want - device->rs_in_flight - plan->total;
+
+       /* Plan ahead */
+       cps = correction / steps;
+       fifo_add_val(plan, cps);
+       plan->total += cps * steps;
+
+       /* What we do in this step */
+       curr_corr = fifo_push(plan, 0);
+       plan->total -= curr_corr;
+
+       req_sect = sect_in + curr_corr;
+       if (req_sect < 0)
+               req_sect = 0;
+
+       max_sect = (dc->c_max_rate * 2 * SLEEP_TIME) / HZ;
+       if (req_sect > max_sect)
+               req_sect = max_sect;
+
+       /*
+       drbd_warn(device, "si=%u if=%d wa=%u co=%d st=%d cps=%d pl=%d cc=%d rs=%d\n",
+                sect_in, device->rs_in_flight, want, correction,
+                steps, cps, device->rs_planed, curr_corr, req_sect);
+       */
+
+       return req_sect;
+}
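+
+/* Worked example (assuming SLEEP_TIME == HZ/10, i.e. 100ms controller steps):
+ * with resync_rate == 10240 KiB/s, the start-of-resync branch above wants
+ * (10240 * 2 * SLEEP_TIME) / HZ == 2048 sectors (1 MiB) in flight per plan
+ * step, i.e. want == 2048 * steps.  The fifo then spreads the correction
+ * "correction / steps" over the plan, and req_sect is finally clamped to
+ * c_max_rate converted the same way. */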
+
+static int drbd_rs_number_requests(struct drbd_device *device)
+{
+       unsigned int sect_in;  /* Number of sectors that came in since the last turn */
+       int number, mxb;
+
+       sect_in = atomic_xchg(&device->rs_sect_in, 0);
+       device->rs_in_flight -= sect_in;
+
+       rcu_read_lock();
+       mxb = drbd_get_max_buffers(device) / 2;
+       if (rcu_dereference(device->rs_plan_s)->size) {
+               number = drbd_rs_controller(device, sect_in) >> (BM_BLOCK_SHIFT - 9);
+               device->c_sync_rate = number * HZ * (BM_BLOCK_SIZE / 1024) / SLEEP_TIME;
+       } else {
+               device->c_sync_rate = rcu_dereference(device->ldev->disk_conf)->resync_rate;
+               number = SLEEP_TIME * device->c_sync_rate  / ((BM_BLOCK_SIZE / 1024) * HZ);
+       }
+       rcu_read_unlock();
+
+       /* Don't have more than "max-buffers"/2 in-flight.
+        * Otherwise we may cause the remote site to stall on drbd_alloc_pages(),
+        * potentially causing a distributed deadlock on congestion during
+        * online-verify or (checksum-based) resync, if max-buffers,
+        * socket buffer sizes and resync rate settings are misconfigured. */
+
+       /* note that "number" is in units of "BM_BLOCK_SIZE" (which is 4k),
+        * mxb (as used here, and in drbd_alloc_pages on the peer) is
+        * "number of pages" (typically also 4k),
+        * but "rs_in_flight" is in "sectors" (512 Byte). */
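+       /* e.g. rs_in_flight == 800 sectors is 100 4k-blocks; with
+        * max-buffers == 2048, mxb == 1024, so at most 924 more requests
+        * are allowed in this turn. */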
+       if (mxb - device->rs_in_flight/8 < number)
+               number = mxb - device->rs_in_flight/8;
+
+       return number;
+}
+
+static int make_resync_request(struct drbd_device *const device, int cancel)
+{
+       struct drbd_peer_device *const peer_device = first_peer_device(device);
+       struct drbd_connection *const connection = peer_device ? peer_device->connection : NULL;
+       unsigned long bit;
+       sector_t sector;
+       const sector_t capacity = drbd_get_capacity(device->this_bdev);
+       int max_bio_size;
+       int number, rollback_i, size;
+       int align, requeue = 0;
+       int i = 0;
+
+       if (unlikely(cancel))
+               return 0;
+
+       if (device->rs_total == 0) {
+               /* empty resync? */
+               drbd_resync_finished(device);
+               return 0;
+       }
+
+       if (!get_ldev(device)) {
+               /* Since we only need to access device->rsync, a
+                  get_ldev_if_state(device, D_FAILED) would be sufficient, but
+                  continuing resync with a broken disk makes no sense at
+                  all. */
+               drbd_err(device, "Disk broke down during resync!\n");
+               return 0;
+       }
+
+       max_bio_size = queue_max_hw_sectors(device->rq_queue) << 9;
+       number = drbd_rs_number_requests(device);
+       if (number <= 0)
+               goto requeue;
+
+       for (i = 0; i < number; i++) {
+               /* Stop generating RS requests when half of the send buffer is filled,
+                * but notify TCP that we'd like to have more space. */
+               mutex_lock(&connection->data.mutex);
+               if (connection->data.socket) {
+                       struct sock *sk = connection->data.socket->sk;
+                       int queued = sk->sk_wmem_queued;
+                       int sndbuf = sk->sk_sndbuf;
+                       if (queued > sndbuf / 2) {
+                               requeue = 1;
+                               if (sk->sk_socket)
+                                       set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+                       }
+               } else
+                       requeue = 1;
+               mutex_unlock(&connection->data.mutex);
+               if (requeue)
+                       goto requeue;
+
+next_sector:
+               size = BM_BLOCK_SIZE;
+               bit  = drbd_bm_find_next(device, device->bm_resync_fo);
+
+               if (bit == DRBD_END_OF_BITMAP) {
+                       device->bm_resync_fo = drbd_bm_bits(device);
+                       put_ldev(device);
+                       return 0;
+               }
+
+               sector = BM_BIT_TO_SECT(bit);
+
+               if (drbd_try_rs_begin_io(device, sector)) {
+                       device->bm_resync_fo = bit;
+                       goto requeue;
+               }
+               device->bm_resync_fo = bit + 1;
+
+               if (unlikely(drbd_bm_test_bit(device, bit) == 0)) {
+                       drbd_rs_complete_io(device, sector);
+                       goto next_sector;
+               }
+
+#if DRBD_MAX_BIO_SIZE > BM_BLOCK_SIZE
+               /* try to find some adjacent bits.
+                * we stop if we have already the maximum req size.
+                *
+                * Additionally always align bigger requests, in order to
+                * be prepared for all stripe sizes of software RAIDs.
+                */
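+               /* e.g. with 4k bitmap blocks: a request may grow 4k -> 8k ->
+                * 16k ... only while the start sector stays aligned to the
+                * grown size, and never past max_bio_size or an extent
+                * boundary. */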
+               align = 1;
+               rollback_i = i;
+               while (i < number) {
+                       if (size + BM_BLOCK_SIZE > max_bio_size)
+                               break;
+
+                       /* Be always aligned */
+                       if (sector & ((1<<(align+3))-1))
+                               break;
+
+                       /* do not cross extent boundaries */
+                       if (((bit+1) & BM_BLOCKS_PER_BM_EXT_MASK) == 0)
+                               break;
+                       /* now, is it actually dirty, after all?
+                        * caution, drbd_bm_test_bit is tri-state for some
+                        * obscure reason; ( b == 0 ) would get the out-of-band
+                        * only accidentally right because of the "oddly sized"
+                        * adjustment below */
+                       if (drbd_bm_test_bit(device, bit+1) != 1)
+                               break;
+                       bit++;
+                       size += BM_BLOCK_SIZE;
+                       if ((BM_BLOCK_SIZE << align) <= size)
+                               align++;
+                       i++;
+               }
+               /* if we merged some,
+                * reset the offset to start the next drbd_bm_find_next from */
+               if (size > BM_BLOCK_SIZE)
+                       device->bm_resync_fo = bit + 1;
+#endif
+
+               /* adjust very last sectors, in case we are oddly sized */
+               if (sector + (size>>9) > capacity)
+                       size = (capacity-sector)<<9;
+
+               if (device->use_csums) {
+                       switch (read_for_csum(peer_device, sector, size)) {
+                       case -EIO: /* Disk failure */
+                               put_ldev(device);
+                               return -EIO;
+                       case -EAGAIN: /* allocation failed, or ldev busy */
+                               drbd_rs_complete_io(device, sector);
+                               device->bm_resync_fo = BM_SECT_TO_BIT(sector);
+                               i = rollback_i;
+                               goto requeue;
+                       case 0:
+                               /* everything ok */
+                               break;
+                       default:
+                               BUG();
+                       }
+               } else {
+                       int err;
+
+                       inc_rs_pending(device);
+                       err = drbd_send_drequest(peer_device, P_RS_DATA_REQUEST,
+                                                sector, size, ID_SYNCER);
+                       if (err) {
+                               drbd_err(device, "drbd_send_drequest() failed, aborting...\n");
+                               dec_rs_pending(device);
+                               put_ldev(device);
+                               return err;
+                       }
+               }
+       }
+
+       if (device->bm_resync_fo >= drbd_bm_bits(device)) {
+               /* last syncer _request_ was sent,
+                * but the P_RS_DATA_REPLY not yet received.  sync will end (and
+                * next sync group will resume), as soon as we receive the last
+                * resync data block, and the last bit is cleared.
+                * until then resync "work" is "inactive" ...
+                */
+               put_ldev(device);
+               return 0;
+       }
+
+ requeue:
+       device->rs_in_flight += (i << (BM_BLOCK_SHIFT - 9));
+       mod_timer(&device->resync_timer, jiffies + SLEEP_TIME);
+       put_ldev(device);
+       return 0;
+}
+
+static int make_ov_request(struct drbd_device *device, int cancel)
+{
+       int number, i, size;
+       sector_t sector;
+       const sector_t capacity = drbd_get_capacity(device->this_bdev);
+       bool stop_sector_reached = false;
+
+       if (unlikely(cancel))
+               return 1;
+
+       number = drbd_rs_number_requests(device);
+
+       sector = device->ov_position;
+       for (i = 0; i < number; i++) {
+               if (sector >= capacity)
+                       return 1;
+
+               /* We check for "finished" only in the reply path:
+                * w_e_end_ov_reply().
+                * We need to send at least one request out. */
+               stop_sector_reached = i > 0
+                       && verify_can_do_stop_sector(device)
+                       && sector >= device->ov_stop_sector;
+               if (stop_sector_reached)
+                       break;
+
+               size = BM_BLOCK_SIZE;
+
+               if (drbd_try_rs_begin_io(device, sector)) {
+                       device->ov_position = sector;
+                       goto requeue;
+               }
+
+               if (sector + (size>>9) > capacity)
+                       size = (capacity-sector)<<9;
+
+               inc_rs_pending(device);
+               if (drbd_send_ov_request(first_peer_device(device), sector, size)) {
+                       dec_rs_pending(device);
+                       return 0;
+               }
+               sector += BM_SECT_PER_BIT;
+       }
+       device->ov_position = sector;
+
+ requeue:
+       device->rs_in_flight += (i << (BM_BLOCK_SHIFT - 9));
+       if (i == 0 || !stop_sector_reached)
+               mod_timer(&device->resync_timer, jiffies + SLEEP_TIME);
+       return 1;
+}
+
+int w_ov_finished(struct drbd_work *w, int cancel)
+{
+       struct drbd_device_work *dw =
+               container_of(w, struct drbd_device_work, w);
+       struct drbd_device *device = dw->device;
+       kfree(dw);
+       ov_out_of_sync_print(device);
+       drbd_resync_finished(device);
+
+       return 0;
+}
+
+static int w_resync_finished(struct drbd_work *w, int cancel)
+{
+       struct drbd_device_work *dw =
+               container_of(w, struct drbd_device_work, w);
+       struct drbd_device *device = dw->device;
+       kfree(dw);
+
+       drbd_resync_finished(device);
+
+       return 0;
+}
+
+static void ping_peer(struct drbd_device *device)
+{
+       struct drbd_connection *connection = first_peer_device(device)->connection;
+
+       clear_bit(GOT_PING_ACK, &connection->flags);
+       request_ping(connection);
+       wait_event(connection->ping_wait,
+                  test_bit(GOT_PING_ACK, &connection->flags) || device->state.conn < C_CONNECTED);
+}
+
+int drbd_resync_finished(struct drbd_device *device)
+{
+       unsigned long db, dt, dbdt;
+       unsigned long n_oos;
+       union drbd_state os, ns;
+       struct drbd_device_work *dw;
+       char *khelper_cmd = NULL;
+       int verify_done = 0;
+
+       /* Remove all elements from the resync LRU. If future actions
+        * set bits in the (main) bitmap, the entries in the
+        * resync LRU would otherwise be wrong. */
+       if (drbd_rs_del_all(device)) {
+               /* In case this is not possible now, most probably because
+                * there are P_RS_DATA_REPLY packets lingering on the worker's
+                * queue (or even the read operations for those packets
+                * are not finished by now), retry in 100ms. */
+
+               schedule_timeout_interruptible(HZ / 10);
+               dw = kmalloc(sizeof(struct drbd_device_work), GFP_ATOMIC);
+               if (dw) {
+                       dw->w.cb = w_resync_finished;
+                       dw->device = device;
+                       drbd_queue_work(&first_peer_device(device)->connection->sender_work,
+                                       &dw->w);
+                       return 1;
+               }
+               drbd_err(device, "Warn failed to drbd_rs_del_all() and to kmalloc(dw).\n");
+       }
+
+       dt = (jiffies - device->rs_start - device->rs_paused) / HZ;
+       if (dt <= 0)
+               dt = 1;
+
+       db = device->rs_total;
+       /* adjust for verify start and stop sectors, respectively the reached position */
+       if (device->state.conn == C_VERIFY_S || device->state.conn == C_VERIFY_T)
+               db -= device->ov_left;
+
+       dbdt = Bit2KB(db/dt);
+       device->rs_paused /= HZ;
+
+       if (!get_ldev(device))
+               goto out;
+
+       ping_peer(device);
+
+       spin_lock_irq(&device->resource->req_lock);
+       os = drbd_read_state(device);
+
+       verify_done = (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T);
+
+       /* This protects us against multiple calls (that can happen in the presence
+          of application IO), and against connectivity loss just before we arrive here. */
+       if (os.conn <= C_CONNECTED)
+               goto out_unlock;
+
+       ns = os;
+       ns.conn = C_CONNECTED;
+
+       drbd_info(device, "%s done (total %lu sec; paused %lu sec; %lu K/sec)\n",
+            verify_done ? "Online verify" : "Resync",
+            dt + device->rs_paused, device->rs_paused, dbdt);
+
+       n_oos = drbd_bm_total_weight(device);
+
+       if (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) {
+               if (n_oos) {
+                       drbd_alert(device, "Online verify found %lu %dk blocks out of sync!\n",
+                             n_oos, Bit2KB(1));
+                       khelper_cmd = "out-of-sync";
+               }
+       } else {
+               D_ASSERT(device, (n_oos - device->rs_failed) == 0);
+
+               if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T)
+                       khelper_cmd = "after-resync-target";
+
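+               /* integer percentage without risking overflow of (s*100): for
+                * t < 100000 compute (s*100)/t, otherwise s/(t/100);
+                * e.g. s == 50000, t == 200000 gives 50000/2000 == 25 %. */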
+               if (device->use_csums && device->rs_total) {
+                       const unsigned long s = device->rs_same_csum;
+                       const unsigned long t = device->rs_total;
+                       const int ratio =
+                               (t == 0)     ? 0 :
+                       (t < 100000) ? ((s*100)/t) : (s/(t/100));
+                       drbd_info(device, "%u %% had equal checksums, eliminated: %luK; "
+                            "transferred %luK total %luK\n",
+                            ratio,
+                            Bit2KB(device->rs_same_csum),
+                            Bit2KB(device->rs_total - device->rs_same_csum),
+                            Bit2KB(device->rs_total));
+               }
+       }
+
+       if (device->rs_failed) {
+               drbd_info(device, "            %lu failed blocks\n", device->rs_failed);
+
+               if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) {
+                       ns.disk = D_INCONSISTENT;
+                       ns.pdsk = D_UP_TO_DATE;
+               } else {
+                       ns.disk = D_UP_TO_DATE;
+                       ns.pdsk = D_INCONSISTENT;
+               }
+       } else {
+               ns.disk = D_UP_TO_DATE;
+               ns.pdsk = D_UP_TO_DATE;
+
+               if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) {
+                       if (device->p_uuid) {
+                               int i;
+                               for (i = UI_BITMAP ; i <= UI_HISTORY_END ; i++)
+                                       _drbd_uuid_set(device, i, device->p_uuid[i]);
+                               drbd_uuid_set(device, UI_BITMAP, device->ldev->md.uuid[UI_CURRENT]);
+                               _drbd_uuid_set(device, UI_CURRENT, device->p_uuid[UI_CURRENT]);
+                       } else {
+                               drbd_err(device, "device->p_uuid is NULL! BUG\n");
+                       }
+               }
+
+               if (!(os.conn == C_VERIFY_S || os.conn == C_VERIFY_T)) {
+                       /* for verify runs, we don't update uuids here,
+                        * so there would be nothing to report. */
+                       drbd_uuid_set_bm(device, 0UL);
+                       drbd_print_uuids(device, "updated UUIDs");
+                       if (device->p_uuid) {
+                               /* Now the two UUID sets are equal, update what we
+                                * know of the peer. */
+                               int i;
+                               for (i = UI_CURRENT ; i <= UI_HISTORY_END ; i++)
+                                       device->p_uuid[i] = device->ldev->md.uuid[i];
+                       }
+               }
+       }
+
+       _drbd_set_state(device, ns, CS_VERBOSE, NULL);
+out_unlock:
+       spin_unlock_irq(&device->resource->req_lock);
+       put_ldev(device);
+out:
+       device->rs_total  = 0;
+       device->rs_failed = 0;
+       device->rs_paused = 0;
+
+       /* reset start sector, if we reached end of device */
+       if (verify_done && device->ov_left == 0)
+               device->ov_start_sector = 0;
+
+       drbd_md_sync(device);
+
+       if (khelper_cmd)
+               drbd_khelper(device, khelper_cmd);
+
+       return 1;
+}
+
+/* helper */
+static void move_to_net_ee_or_free(struct drbd_device *device, struct drbd_peer_request *peer_req)
+{
+       if (drbd_peer_req_has_active_page(peer_req)) {
+               /* This might happen if sendpage() has not finished */
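+               /* round the request size up to whole pages, e.g. 5120 bytes
+                * -> 2 pages with a 4k PAGE_SIZE */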
+               int i = (peer_req->i.size + PAGE_SIZE -1) >> PAGE_SHIFT;
+               atomic_add(i, &device->pp_in_use_by_net);
+               atomic_sub(i, &device->pp_in_use);
+               spin_lock_irq(&device->resource->req_lock);
+               list_add_tail(&peer_req->w.list, &device->net_ee);
+               spin_unlock_irq(&device->resource->req_lock);
+               wake_up(&drbd_pp_wait);
+       } else
+               drbd_free_peer_req(device, peer_req);
+}
+
+/**
+ * w_e_end_data_req() - Worker callback, to send a P_DATA_REPLY packet in response to a P_DATA_REQUEST
+ * @w:         work object.
+ * @cancel:    The connection will be closed anyway
+ */
+int w_e_end_data_req(struct drbd_work *w, int cancel)
+{
+       struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
+       struct drbd_peer_device *peer_device = peer_req->peer_device;
+       struct drbd_device *device = peer_device->device;
+       int err;
+
+       if (unlikely(cancel)) {
+               drbd_free_peer_req(device, peer_req);
+               dec_unacked(device);
+               return 0;
+       }
+
+       if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
+               err = drbd_send_block(peer_device, P_DATA_REPLY, peer_req);
+       } else {
+               if (__ratelimit(&drbd_ratelimit_state))
+                       drbd_err(device, "Sending NegDReply. sector=%llus.\n",
+                           (unsigned long long)peer_req->i.sector);
+
+               err = drbd_send_ack(peer_device, P_NEG_DREPLY, peer_req);
+       }
+
+       dec_unacked(device);
+
+       move_to_net_ee_or_free(device, peer_req);
+
+       if (unlikely(err))
+               drbd_err(device, "drbd_send_block() failed\n");
+       return err;
+}
+
+/**
+ * w_e_end_rsdata_req() - Worker callback to send a P_RS_DATA_REPLY packet in response to a P_RS_DATA_REQUEST
+ * @w:         work object.
+ * @cancel:    The connection will be closed anyway
+ */
+int w_e_end_rsdata_req(struct drbd_work *w, int cancel)
+{
+       struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
+       struct drbd_peer_device *peer_device = peer_req->peer_device;
+       struct drbd_device *device = peer_device->device;
+       int err;
+
+       if (unlikely(cancel)) {
+               drbd_free_peer_req(device, peer_req);
+               dec_unacked(device);
+               return 0;
+       }
+
+       if (get_ldev_if_state(device, D_FAILED)) {
+               drbd_rs_complete_io(device, peer_req->i.sector);
+               put_ldev(device);
+       }
+
+       if (device->state.conn == C_AHEAD) {
+               err = drbd_send_ack(peer_device, P_RS_CANCEL, peer_req);
+       } else if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
+               if (likely(device->state.pdsk >= D_INCONSISTENT)) {
+                       inc_rs_pending(device);
+                       err = drbd_send_block(peer_device, P_RS_DATA_REPLY, peer_req);
+               } else {
+                       if (__ratelimit(&drbd_ratelimit_state))
+                               drbd_err(device, "Not sending RSDataReply, "
+                                   "partner DISKLESS!\n");
+                       err = 0;
+               }
+       } else {
+               if (__ratelimit(&drbd_ratelimit_state))
+                       drbd_err(device, "Sending NegRSDReply. sector %llus.\n",
+                           (unsigned long long)peer_req->i.sector);
+
+               err = drbd_send_ack(peer_device, P_NEG_RS_DREPLY, peer_req);
+
+               /* update resync data with failure */
+               drbd_rs_failed_io(device, peer_req->i.sector, peer_req->i.size);
+       }
+
+       dec_unacked(device);
+
+       move_to_net_ee_or_free(device, peer_req);
+
+       if (unlikely(err))
+               drbd_err(device, "drbd_send_block() failed\n");
+       return err;
+}
+
+int w_e_end_csum_rs_req(struct drbd_work *w, int cancel)
+{
+       struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
+       struct drbd_peer_device *peer_device = peer_req->peer_device;
+       struct drbd_device *device = peer_device->device;
+       struct digest_info *di;
+       int digest_size;
+       void *digest = NULL;
+       int err, eq = 0;
+
+       if (unlikely(cancel)) {
+               drbd_free_peer_req(device, peer_req);
+               dec_unacked(device);
+               return 0;
+       }
+
+       if (get_ldev(device)) {
+               drbd_rs_complete_io(device, peer_req->i.sector);
+               put_ldev(device);
+       }
+
+       di = peer_req->digest;
+
+       if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
+               /* quick hack to try to avoid a race against reconfiguration.
+                * a real fix would be much more involved,
+                * introducing more locking mechanisms */
+               if (peer_device->connection->csums_tfm) {
+                       digest_size = crypto_hash_digestsize(peer_device->connection->csums_tfm);
+                       D_ASSERT(device, digest_size == di->digest_size);
+                       digest = kmalloc(digest_size, GFP_NOIO);
+               }
+               if (digest) {
+                       drbd_csum_ee(peer_device->connection->csums_tfm, peer_req, digest);
+                       eq = !memcmp(digest, di->digest, digest_size);
+                       kfree(digest);
+               }
+
+               if (eq) {
+                       drbd_set_in_sync(device, peer_req->i.sector, peer_req->i.size);
+                       /* rs_same_csums unit is BM_BLOCK_SIZE */
+                       device->rs_same_csum += peer_req->i.size >> BM_BLOCK_SHIFT;
+                       err = drbd_send_ack(peer_device, P_RS_IS_IN_SYNC, peer_req);
+               } else {
+                       inc_rs_pending(device);
+                       peer_req->block_id = ID_SYNCER; /* By setting block_id, digest pointer becomes invalid! */
+                       peer_req->flags &= ~EE_HAS_DIGEST; /* This peer request no longer has a digest pointer */
+                       kfree(di);
+                       err = drbd_send_block(peer_device, P_RS_DATA_REPLY, peer_req);
+               }
+       } else {
+               err = drbd_send_ack(peer_device, P_NEG_RS_DREPLY, peer_req);
+               if (__ratelimit(&drbd_ratelimit_state))
+                       drbd_err(device, "Sending NegDReply. I guess it gets messy.\n");
+       }
+
+       dec_unacked(device);
+       move_to_net_ee_or_free(device, peer_req);
+
+       if (unlikely(err))
+               drbd_err(device, "drbd_send_block/ack() failed\n");
+       return err;
+}
+
+int w_e_end_ov_req(struct drbd_work *w, int cancel)
+{
+       struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
+       struct drbd_peer_device *peer_device = peer_req->peer_device;
+       struct drbd_device *device = peer_device->device;
+       sector_t sector = peer_req->i.sector;
+       unsigned int size = peer_req->i.size;
+       int digest_size;
+       void *digest;
+       int err = 0;
+
+       if (unlikely(cancel))
+               goto out;
+
+       digest_size = crypto_hash_digestsize(peer_device->connection->verify_tfm);
+       digest = kmalloc(digest_size, GFP_NOIO);
+       if (!digest) {
+               err = 1;        /* terminate the connection in case the allocation failed */
+               goto out;
+       }
+
+       if (likely(!(peer_req->flags & EE_WAS_ERROR)))
+               drbd_csum_ee(peer_device->connection->verify_tfm, peer_req, digest);
+       else
+               memset(digest, 0, digest_size);
+
+       /* Free e and pages before send.
+        * In case we block on congestion, we could otherwise run into
+        * some distributed deadlock, if the other side blocks on
+        * congestion as well, because our receiver blocks in
+        * drbd_alloc_pages due to pp_in_use > max_buffers. */
+       drbd_free_peer_req(device, peer_req);
+       peer_req = NULL;
+       inc_rs_pending(device);
+       err = drbd_send_drequest_csum(peer_device, sector, size, digest, digest_size, P_OV_REPLY);
+       if (err)
+               dec_rs_pending(device);
+       kfree(digest);
+
+out:
+       if (peer_req)
+               drbd_free_peer_req(device, peer_req);
+       dec_unacked(device);
+       return err;
+}
+
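+/* Merge runs of adjacent out-of-sync blocks: if the new range starts exactly
+ * where the last one ended, extend that run; otherwise start a new one.  This
+ * keeps out-of-sync accounting per contiguous range rather than per block. */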
+void drbd_ov_out_of_sync_found(struct drbd_device *device, sector_t sector, int size)
+{
+       if (device->ov_last_oos_start + device->ov_last_oos_size == sector) {
+               device->ov_last_oos_size += size>>9;
+       } else {
+               device->ov_last_oos_start = sector;
+               device->ov_last_oos_size = size>>9;
+       }
+       drbd_set_out_of_sync(device, sector, size);
+}
+
+int w_e_end_ov_reply(struct drbd_work *w, int cancel)
+{
+       struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
+       struct drbd_peer_device *peer_device = peer_req->peer_device;
+       struct drbd_device *device = peer_device->device;
+       struct digest_info *di;
+       void *digest;
+       sector_t sector = peer_req->i.sector;
+       unsigned int size = peer_req->i.size;
+       int digest_size;
+       int err, eq = 0;
+       bool stop_sector_reached = false;
+
+       if (unlikely(cancel)) {
+               drbd_free_peer_req(device, peer_req);
+               dec_unacked(device);
+               return 0;
+       }
+
+       /* after "cancel", because after drbd_disconnect/drbd_rs_cancel_all
+        * the resync lru has been cleaned up already */
+       if (get_ldev(device)) {
+               drbd_rs_complete_io(device, peer_req->i.sector);
+               put_ldev(device);
+       }
+
+       di = peer_req->digest;
+
+       if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
+               digest_size = crypto_hash_digestsize(peer_device->connection->verify_tfm);
+               digest = kmalloc(digest_size, GFP_NOIO);
+               if (digest) {
+                       drbd_csum_ee(peer_device->connection->verify_tfm, peer_req, digest);
+
+                       D_ASSERT(device, digest_size == di->digest_size);
+                       eq = !memcmp(digest, di->digest, digest_size);
+                       kfree(digest);
+               }
+       }
+
+       /* Free peer_req and pages before send.
+        * In case we block on congestion, we could otherwise run into
+        * some distributed deadlock, if the other side blocks on
+        * congestion as well, because our receiver blocks in
+        * drbd_alloc_pages due to pp_in_use > max_buffers. */
+       drbd_free_peer_req(device, peer_req);
+       if (!eq)
+               drbd_ov_out_of_sync_found(device, sector, size);
+       else
+               ov_out_of_sync_print(device);
+
+       err = drbd_send_ack_ex(peer_device, P_OV_RESULT, sector, size,
+                              eq ? ID_IN_SYNC : ID_OUT_OF_SYNC);
+
+       dec_unacked(device);
+
+       --device->ov_left;
+
+       /* let's advance progress step marks only for every other megabyte */
+       if ((device->ov_left & 0x200) == 0x200)
+               drbd_advance_rs_marks(device, device->ov_left);
+
+       stop_sector_reached = verify_can_do_stop_sector(device) &&
+               (sector + (size>>9)) >= device->ov_stop_sector;
+
+       if (device->ov_left == 0 || stop_sector_reached) {
+               ov_out_of_sync_print(device);
+               drbd_resync_finished(device);
+       }
+
+       return err;
+}
+
+/* FIXME
+ * We need to track the number of pending barrier acks,
+ * and to be able to wait for them.
+ * See also comment in drbd_adm_attach before drbd_suspend_io.
+ */
+static int drbd_send_barrier(struct drbd_connection *connection)
+{
+       struct p_barrier *p;
+       struct drbd_socket *sock;
+
+       sock = &connection->data;
+       p = conn_prepare_command(connection, sock);
+       if (!p)
+               return -EIO;
+       p->barrier = connection->send.current_epoch_nr;
+       p->pad = 0;
+       connection->send.current_epoch_writes = 0;
+
+       return conn_send_command(connection, sock, P_BARRIER, sizeof(*p), NULL, 0);
+}
+
+int w_send_write_hint(struct drbd_work *w, int cancel)
+{
+       struct drbd_device *device =
+               container_of(w, struct drbd_device, unplug_work);
+       struct drbd_socket *sock;
+
+       if (cancel)
+               return 0;
+       sock = &first_peer_device(device)->connection->data;
+       if (!drbd_prepare_command(first_peer_device(device), sock))
+               return -EIO;
+       return drbd_send_command(first_peer_device(device), sock, P_UNPLUG_REMOTE, 0, NULL, 0);
+}
+
+static void re_init_if_first_write(struct drbd_connection *connection, unsigned int epoch)
+{
+       if (!connection->send.seen_any_write_yet) {
+               connection->send.seen_any_write_yet = true;
+               connection->send.current_epoch_nr = epoch;
+               connection->send.current_epoch_writes = 0;
+       }
+}
+
+static void maybe_send_barrier(struct drbd_connection *connection, unsigned int epoch)
+{
+       /* re-init if first write on this connection */
+       if (!connection->send.seen_any_write_yet)
+               return;
+       if (connection->send.current_epoch_nr != epoch) {
+               if (connection->send.current_epoch_writes)
+                       drbd_send_barrier(connection);
+               connection->send.current_epoch_nr = epoch;
+       }
+}
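+
+/* Example: after writes in epoch 7, the first request of epoch 8 triggers a
+ * P_BARRIER for epoch 7 before the epoch-8 data goes out, so the peer can
+ * close epochs in order; an epoch that saw no writes needs no barrier. */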
+
+int w_send_out_of_sync(struct drbd_work *w, int cancel)
+{
+       struct drbd_request *req = container_of(w, struct drbd_request, w);
+       struct drbd_device *device = req->device;
+       struct drbd_peer_device *const peer_device = first_peer_device(device);
+       struct drbd_connection *const connection = peer_device->connection;
+       int err;
+
+       if (unlikely(cancel)) {
+               req_mod(req, SEND_CANCELED);
+               return 0;
+       }
+       req->pre_send_jif = jiffies;
+
+       /* this time, no connection->send.current_epoch_writes++;
+        * If it was sent, it was the closing barrier for the last
+        * replicated epoch, before we went into AHEAD mode.
+        * No more barriers will be sent, until we leave AHEAD mode again. */
+       maybe_send_barrier(connection, req->epoch);
+
+       err = drbd_send_out_of_sync(peer_device, req);
+       req_mod(req, OOS_HANDED_TO_NETWORK);
+
+       return err;
+}
+
+/**
+ * w_send_dblock() - Worker callback to send a P_DATA packet in order to mirror a write request
+ * @w:         work object.
+ * @cancel:    The connection will be closed anyway
+ */
+int w_send_dblock(struct drbd_work *w, int cancel)
+{
+       struct drbd_request *req = container_of(w, struct drbd_request, w);
+       struct drbd_device *device = req->device;
+       struct drbd_peer_device *const peer_device = first_peer_device(device);
+       struct drbd_connection *connection = peer_device->connection;
+       int err;
+
+       if (unlikely(cancel)) {
+               req_mod(req, SEND_CANCELED);
+               return 0;
+       }
+       req->pre_send_jif = jiffies;
+
+       re_init_if_first_write(connection, req->epoch);
+       maybe_send_barrier(connection, req->epoch);
+       connection->send.current_epoch_writes++;
+
+       err = drbd_send_dblock(peer_device, req);
+       req_mod(req, err ? SEND_FAILED : HANDED_OVER_TO_NETWORK);
+
+       return err;
+}
+
+/**
+ * w_send_read_req() - Worker callback to send a read request (P_DATA_REQUEST) packet
+ * @w:         work object.
+ * @cancel:    The connection will be closed anyway
+ */
+int w_send_read_req(struct drbd_work *w, int cancel)
+{
+       struct drbd_request *req = container_of(w, struct drbd_request, w);
+       struct drbd_device *device = req->device;
+       struct drbd_peer_device *const peer_device = first_peer_device(device);
+       struct drbd_connection *connection = peer_device->connection;
+       int err;
+
+       if (unlikely(cancel)) {
+               req_mod(req, SEND_CANCELED);
+               return 0;
+       }
+       req->pre_send_jif = jiffies;
+
+       /* Even read requests may close a write epoch,
+        * if there was any yet. */
+       maybe_send_barrier(connection, req->epoch);
+
+       err = drbd_send_drequest(peer_device, P_DATA_REQUEST, req->i.sector, req->i.size,
+                                (unsigned long)req);
+
+       req_mod(req, err ? SEND_FAILED : HANDED_OVER_TO_NETWORK);
+
+       return err;
+}
+
+int w_restart_disk_io(struct drbd_work *w, int cancel)
+{
+       struct drbd_request *req = container_of(w, struct drbd_request, w);
+       struct drbd_device *device = req->device;
+
+       if (bio_data_dir(req->master_bio) == WRITE && req->rq_state & RQ_IN_ACT_LOG)
+               drbd_al_begin_io(device, &req->i);
+
+       drbd_req_make_private_bio(req, req->master_bio);
+       req->private_bio->bi_bdev = device->ldev->backing_bdev;
+       generic_make_request(req->private_bio);
+
+       return 0;
+}
+
+static int _drbd_may_sync_now(struct drbd_device *device)
+{
+       struct drbd_device *odev = device;
+       int resync_after;
+
+       while (1) {
+               if (!odev->ldev || odev->state.disk == D_DISKLESS)
+                       return 1;
+               rcu_read_lock();
+               resync_after = rcu_dereference(odev->ldev->disk_conf)->resync_after;
+               rcu_read_unlock();
+               if (resync_after == -1)
+                       return 1;
+               odev = minor_to_device(resync_after);
+               if (!odev)
+                       return 1;
+               if ((odev->state.conn >= C_SYNC_SOURCE &&
+                    odev->state.conn <= C_PAUSED_SYNC_T) ||
+                   odev->state.aftr_isp || odev->state.peer_isp ||
+                   odev->state.user_isp)
+                       return 0;
+       }
+}
+
+/**
+ * _drbd_pause_after() - Pause resync on all devices that may not resync now
+ * @device:    DRBD device.
+ *
+ * Called from process context only (admin command and after_state_ch).
+ */
+static int _drbd_pause_after(struct drbd_device *device)
+{
+       struct drbd_device *odev;
+       int i, rv = 0;
+
+       rcu_read_lock();
+       idr_for_each_entry(&drbd_devices, odev, i) {
+               if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS)
+                       continue;
+               if (!_drbd_may_sync_now(odev))
+                       rv |= (__drbd_set_state(_NS(odev, aftr_isp, 1), CS_HARD, NULL)
+                              != SS_NOTHING_TO_DO);
+       }
+       rcu_read_unlock();
+
+       return rv;
+}
+
+/**
+ * _drbd_resume_next() - Resume resync on all devices that may resync now
+ * @device:    DRBD device.
+ *
+ * Called from process context only (admin command and worker).
+ */
+static int _drbd_resume_next(struct drbd_device *device)
+{
+       struct drbd_device *odev;
+       int i, rv = 0;
+
+       rcu_read_lock();
+       idr_for_each_entry(&drbd_devices, odev, i) {
+               if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS)
+                       continue;
+               if (odev->state.aftr_isp) {
+                       if (_drbd_may_sync_now(odev))
+                               rv |= (__drbd_set_state(_NS(odev, aftr_isp, 0),
+                                                       CS_HARD, NULL)
+                                      != SS_NOTHING_TO_DO);
+               }
+       }
+       rcu_read_unlock();
+       return rv;
+}
+
+void resume_next_sg(struct drbd_device *device)
+{
+       write_lock_irq(&global_state_lock);
+       _drbd_resume_next(device);
+       write_unlock_irq(&global_state_lock);
+}
+
+void suspend_other_sg(struct drbd_device *device)
+{
+       write_lock_irq(&global_state_lock);
+       _drbd_pause_after(device);
+       write_unlock_irq(&global_state_lock);
+}
+
+/* caller must hold global_state_lock */
+enum drbd_ret_code drbd_resync_after_valid(struct drbd_device *device, int o_minor)
+{
+       struct drbd_device *odev;
+       int resync_after;
+
+       if (o_minor == -1)
+               return NO_ERROR;
+       if (o_minor < -1 || o_minor > MINORMASK)
+               return ERR_RESYNC_AFTER;
+
+       /* check for loops */
+       odev = minor_to_device(o_minor);
+       while (1) {
+               if (odev == device)
+                       return ERR_RESYNC_AFTER_CYCLE;
+
+               /* You are free to depend on diskless, non-existent,
+                * or not-yet/no-longer existing minors.
+                * We only reject dependency loops.
+                * We cannot follow the dependency chain beyond a detached or
+                * missing minor.
+                */
+               if (!odev || !odev->ldev || odev->state.disk == D_DISKLESS)
+                       return NO_ERROR;
+
+               rcu_read_lock();
+               resync_after = rcu_dereference(odev->ldev->disk_conf)->resync_after;
+               rcu_read_unlock();
+               /* dependency chain ends here, no cycles. */
+               if (resync_after == -1)
+                       return NO_ERROR;
+
+               /* follow the dependency chain */
+               odev = minor_to_device(resync_after);
+       }
+}
+
+/* caller must hold global_state_lock */
+void drbd_resync_after_changed(struct drbd_device *device)
+{
+       int changes;
+
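+       /* Iterate to a fixed point: pausing one device may allow a device
+        * that depends on it to resume, and vice versa, which in turn can
+        * change what _drbd_may_sync_now() reports for the others.  Each
+        * pass only flips aftr_isp bits that disagree with the current
+        * dependency state, so this terminates quickly. */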
+       do {
+               changes  = _drbd_pause_after(device);
+               changes |= _drbd_resume_next(device);
+       } while (changes);
+}
+
+void drbd_rs_controller_reset(struct drbd_device *device)
+{
+       struct gendisk *disk = device->ldev->backing_bdev->bd_contains->bd_disk;
+       struct fifo_buffer *plan;
+
+       atomic_set(&device->rs_sect_in, 0);
+       atomic_set(&device->rs_sect_ev, 0);
+       device->rs_in_flight = 0;
+       device->rs_last_events =
+               (int)part_stat_read(&disk->part0, sectors[0]) +
+               (int)part_stat_read(&disk->part0, sectors[1]);
+
+       /* Updating the RCU-protected object in place is necessary since
+          this function gets called from atomic context.
+          It is valid since all other updates also lead to a completely
+          empty FIFO. */
+       rcu_read_lock();
+       plan = rcu_dereference(device->rs_plan_s);
+       plan->total = 0;
+       fifo_set(plan, 0);
+       rcu_read_unlock();
+}
+
+void start_resync_timer_fn(unsigned long data)
+{
+       struct drbd_device *device = (struct drbd_device *) data;
+       drbd_device_post_work(device, RS_START);
+}
+
+static void do_start_resync(struct drbd_device *device)
+{
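+       /* Do not start while we still owe the peer ACKs (unacked_cnt) or
+        * have resync answers outstanding (rs_pending_cnt); re-arm the
+        * timer to retry in HZ/10, i.e. about 100 ms. */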
+       if (atomic_read(&device->unacked_cnt) || atomic_read(&device->rs_pending_cnt)) {
+               drbd_warn(device, "postponing start_resync ...\n");
+               device->start_resync_timer.expires = jiffies + HZ/10;
+               add_timer(&device->start_resync_timer);
+               return;
+       }
+
+       drbd_start_resync(device, C_SYNC_SOURCE);
+       clear_bit(AHEAD_TO_SYNC_SOURCE, &device->flags);
+}
+
+static bool use_checksum_based_resync(struct drbd_connection *connection, struct drbd_device *device)
+{
+       bool csums_after_crash_only;
+       rcu_read_lock();
+       csums_after_crash_only = rcu_dereference(connection->net_conf)->csums_after_crash_only;
+       rcu_read_unlock();
+       return connection->agreed_pro_version >= 89 &&          /* supported? */
+               connection->csums_tfm &&                        /* configured? */
+               (csums_after_crash_only == 0                    /* use for each resync? */
+                || test_bit(CRASHED_PRIMARY, &device->flags)); /* or only after Primary crash? */
+}
+
+/**
+ * drbd_start_resync() - Start the resync process
+ * @device:    DRBD device.
+ * @side:      Either C_SYNC_SOURCE or C_SYNC_TARGET
+ *
+ * This function might bring you directly into one of the
+ * C_PAUSED_SYNC_* states.
+ */
+void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
+{
+       struct drbd_peer_device *peer_device = first_peer_device(device);
+       struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
+       union drbd_state ns;
+       int r;
+
+       if (device->state.conn >= C_SYNC_SOURCE && device->state.conn < C_AHEAD) {
+               drbd_err(device, "Resync already running!\n");
+               return;
+       }
+
+       if (!test_bit(B_RS_H_DONE, &device->flags)) {
+               if (side == C_SYNC_TARGET) {
+                       /* Since application IO was locked out during C_WF_BITMAP_T and
+                          C_WF_SYNC_UUID we are still unmodified. Before going to C_SYNC_TARGET,
+                          ask the handler whether we may make the data inconsistent. */
+                       r = drbd_khelper(device, "before-resync-target");
+                       r = (r >> 8) & 0xff;
+                       if (r > 0) {
+                               drbd_info(device, "before-resync-target handler returned %d, "
+                                        "dropping connection.\n", r);
+                               conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
+                               return;
+                       }
+               } else /* C_SYNC_SOURCE */ {
+                       r = drbd_khelper(device, "before-resync-source");
+                       r = (r >> 8) & 0xff;
+                       if (r > 0) {
+                               if (r == 3) {
+                                       drbd_info(device, "before-resync-source handler returned %d, "
+                                                "ignoring. Old userland tools?\n", r);
+                               } else {
+                                       drbd_info(device, "before-resync-source handler returned %d, "
+                                                "dropping connection.\n", r);
+                                       conn_request_state(connection,
+                                                          NS(conn, C_DISCONNECTING), CS_HARD);
+                                       return;
+                               }
+                       }
+               }
+       }
+
+       if (current == connection->worker.task) {
+               /* The worker should not sleep waiting for state_mutex,
+                  as that may take a long time. */
+               if (!mutex_trylock(device->state_mutex)) {
+                       set_bit(B_RS_H_DONE, &device->flags);
+                       device->start_resync_timer.expires = jiffies + HZ/5;
+                       add_timer(&device->start_resync_timer);
+                       return;
+               }
+       } else {
+               mutex_lock(device->state_mutex);
+       }
+       clear_bit(B_RS_H_DONE, &device->flags);
+
+       /* req_lock: serialize with drbd_send_and_submit() and others
+        * global_state_lock: for stable sync-after dependencies */
+       spin_lock_irq(&device->resource->req_lock);
+       write_lock(&global_state_lock);
+       /* Did some connection breakage or IO error race with us? */
+       if (device->state.conn < C_CONNECTED ||
+           !get_ldev_if_state(device, D_NEGOTIATING)) {
+               write_unlock(&global_state_lock);
+               spin_unlock_irq(&device->resource->req_lock);
+               mutex_unlock(device->state_mutex);
+               return;
+       }
+
+       ns = drbd_read_state(device);
+
+       ns.aftr_isp = !_drbd_may_sync_now(device);
+
+       ns.conn = side;
+
+       if (side == C_SYNC_TARGET)
+               ns.disk = D_INCONSISTENT;
+       else /* side == C_SYNC_SOURCE */
+               ns.pdsk = D_INCONSISTENT;
+
+       r = __drbd_set_state(device, ns, CS_VERBOSE, NULL);
+       ns = drbd_read_state(device);
+
+       if (ns.conn < C_CONNECTED)
+               r = SS_UNKNOWN_ERROR;
+
+       if (r == SS_SUCCESS) {
+               unsigned long tw = drbd_bm_total_weight(device);
+               unsigned long now = jiffies;
+               int i;
+
+               device->rs_failed    = 0;
+               device->rs_paused    = 0;
+               device->rs_same_csum = 0;
+               device->rs_last_sect_ev = 0;
+               device->rs_total     = tw;
+               device->rs_start     = now;
+               for (i = 0; i < DRBD_SYNC_MARKS; i++) {
+                       device->rs_mark_left[i] = tw;
+                       device->rs_mark_time[i] = now;
+               }
+               _drbd_pause_after(device);
+               /* Forget potentially stale cached per-resync-extent bit counts.
+                * Open coded drbd_rs_cancel_all(device), we already have IRQs
+                * disabled, and know the disk state is ok. */
+               spin_lock(&device->al_lock);
+               lc_reset(device->resync);
+               device->resync_locked = 0;
+               device->resync_wenr = LC_FREE;
+               spin_unlock(&device->al_lock);
+       }
+       write_unlock(&global_state_lock);
+       spin_unlock_irq(&device->resource->req_lock);
+
+       if (r == SS_SUCCESS) {
+               wake_up(&device->al_wait); /* for lc_reset() above */
+               /* reset rs_last_bcast when a resync or verify is started,
+                * to deal with potential jiffies wrap. */
+               device->rs_last_bcast = jiffies - HZ;
+
+               drbd_info(device, "Began resync as %s (will sync %lu KB [%lu bits set]).\n",
+                    drbd_conn_str(ns.conn),
+                    (unsigned long) device->rs_total << (BM_BLOCK_SHIFT-10),
+                    (unsigned long) device->rs_total);
+               if (side == C_SYNC_TARGET) {
+                       device->bm_resync_fo = 0;
+                       device->use_csums = use_checksum_based_resync(connection, device);
+               } else {
+                       device->use_csums = 0;
+               }
+
+               /* Since protocol 96, we must serialize drbd_gen_and_send_sync_uuid
+                * with w_send_oos, or the sync target will get confused as to
+                * how many bits to resync.  We cannot always do that, because for an
+                * empty resync and protocol < 95, we need to do it here, as we call
+                * drbd_resync_finished from here in that case.
+                * So we call drbd_gen_and_send_sync_uuid here for protocol < 96,
+                * and from after_state_ch otherwise. */
+               if (side == C_SYNC_SOURCE && connection->agreed_pro_version < 96)
+                       drbd_gen_and_send_sync_uuid(peer_device);
+
+               if (connection->agreed_pro_version < 95 && device->rs_total == 0) {
+                       /* This still has a race (about when exactly the peers
+                        * detect connection loss) that can lead to a full sync
+                        * on next handshake. In 8.3.9 we fixed this with explicit
+                        * resync-finished notifications, but the fix
+                        * introduces a protocol change.  Sleeping for some
+                        * time longer than the ping interval + timeout on the
+                        * SyncSource, to give the SyncTarget the chance to
+                        * detect connection loss, then waiting for a ping
+                        * response (implicit in drbd_resync_finished) reduces
+                        * the race considerably, but does not solve it. */
+                       if (side == C_SYNC_SOURCE) {
+                               struct net_conf *nc;
+                               int timeo;
+
+                               rcu_read_lock();
+                               nc = rcu_dereference(connection->net_conf);
+                               timeo = nc->ping_int * HZ + nc->ping_timeo * HZ / 9;
+                               rcu_read_unlock();
+                               schedule_timeout_interruptible(timeo);
+                       }
+                       drbd_resync_finished(device);
+               }
+
+               drbd_rs_controller_reset(device);
+               /* ns.conn may already be != device->state.conn,
+                * we may have been paused in between, or become paused until
+                * the timer triggers.
+                * Either way, that is handled in resync_timer_fn(). */
+               if (ns.conn == C_SYNC_TARGET)
+                       mod_timer(&device->resync_timer, jiffies);
+
+               drbd_md_sync(device);
+       }
+       put_ldev(device);
+       mutex_unlock(device->state_mutex);
+}
+
+static void update_on_disk_bitmap(struct drbd_device *device, bool resync_done)
+{
+       struct sib_info sib = { .sib_reason = SIB_SYNC_PROGRESS, };
+       device->rs_last_bcast = jiffies;
+
+       if (!get_ldev(device))
+               return;
+
+       drbd_bm_write_lazy(device, 0);
+       if (resync_done && is_sync_state(device->state.conn))
+               drbd_resync_finished(device);
+
+       drbd_bcast_event(device, &sib);
+       /* update timestamp, in case it took a while to write out stuff */
+       device->rs_last_bcast = jiffies;
+       put_ldev(device);
+}
+
+static void drbd_ldev_destroy(struct drbd_device *device)
+{
+       lc_destroy(device->resync);
+       device->resync = NULL;
+       lc_destroy(device->act_log);
+       device->act_log = NULL;
+
+       __acquire(local);
+       drbd_free_ldev(device->ldev);
+       device->ldev = NULL;
+       __release(local);
+
+       clear_bit(GOING_DISKLESS, &device->flags);
+       wake_up(&device->misc_wait);
+}
+
+static void go_diskless(struct drbd_device *device)
+{
+       D_ASSERT(device, device->state.disk == D_FAILED);
+       /* we cannot assert local_cnt == 0 here, as get_ldev_if_state will
+        * inc/dec it frequently. Once we are D_DISKLESS, no one will touch
+        * the protected members anymore, though, so once put_ldev reaches zero
+        * again, it will be safe to free them. */
+
+       /* Try to write changed bitmap pages, read errors may have just
+        * set some bits outside the area covered by the activity log.
+        *
+        * If we have an IO error during the bitmap writeout,
+        * we will want a full sync next time, just in case.
+        * (Do we want a specific meta data flag for this?)
+        *
+        * If that does not make it to stable storage either,
+        * we cannot do anything about that anymore.
+        *
+        * We still need to check if both bitmap and ldev are present, we may
+        * end up here after a failed attach, before ldev was even assigned.
+        */
+       if (device->bitmap && device->ldev) {
+               /* An interrupted resync or similar is allowed to recount bits
+                * while we detach.
+                * Any modifications would not be expected anymore, though.
+                */
+               if (drbd_bitmap_io_from_worker(device, drbd_bm_write,
+                                       "detach", BM_LOCKED_TEST_ALLOWED)) {
+                       if (test_bit(WAS_READ_ERROR, &device->flags)) {
+                               drbd_md_set_flag(device, MDF_FULL_SYNC);
+                               drbd_md_sync(device);
+                       }
+               }
+       }
+
+       drbd_force_state(device, NS(disk, D_DISKLESS));
+}
+
+static int do_md_sync(struct drbd_device *device)
+{
+       drbd_warn(device, "md_sync_timer expired! Worker calls drbd_md_sync().\n");
+       drbd_md_sync(device);
+       return 0;
+}
+
+/* only called from drbd_worker thread, no locking */
+void __update_timing_details(
+               struct drbd_thread_timing_details *tdp,
+               unsigned int *cb_nr,
+               void *cb,
+               const char *fn, const unsigned int line)
+{
+       unsigned int i = *cb_nr % DRBD_THREAD_DETAILS_HIST;
+       struct drbd_thread_timing_details *td = tdp + i;
+
+       td->start_jif = jiffies;
+       td->cb_addr = cb;
+       td->caller_fn = fn;
+       td->line = line;
+       td->cb_nr = *cb_nr;
+
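+       /* Zero out the following slot, so a reader of the ring buffer can
+        * tell where the most recent entry ends. */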
+       i = (i+1) % DRBD_THREAD_DETAILS_HIST;
+       td = tdp + i;
+       memset(td, 0, sizeof(*td));
+
+       ++(*cb_nr);
+}
+
+static void do_device_work(struct drbd_device *device, const unsigned long todo)
+{
+       if (test_bit(MD_SYNC, &todo))
+               do_md_sync(device);
+       if (test_bit(RS_DONE, &todo) ||
+           test_bit(RS_PROGRESS, &todo))
+               update_on_disk_bitmap(device, test_bit(RS_DONE, &todo));
+       if (test_bit(GO_DISKLESS, &todo))
+               go_diskless(device);
+       if (test_bit(DESTROY_DISK, &todo))
+               drbd_ldev_destroy(device);
+       if (test_bit(RS_START, &todo))
+               do_start_resync(device);
+}
+
+#define DRBD_DEVICE_WORK_MASK  \
+       ((1UL << GO_DISKLESS)   \
+       |(1UL << DESTROY_DISK)  \
+       |(1UL << MD_SYNC)       \
+       |(1UL << RS_START)      \
+       |(1UL << RS_PROGRESS)   \
+       |(1UL << RS_DONE)       \
+       )
+
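+/* Atomically fetch and clear the device work bits.  The cmpxchg loop
+ * retries whenever another context changed *flags between the read and
+ * the compare-and-swap, so each queued bit is consumed by exactly one
+ * caller even if new bits are set concurrently. */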
+static unsigned long get_work_bits(unsigned long *flags)
+{
+       unsigned long old, new;
+       do {
+               old = *flags;
+               new = old & ~DRBD_DEVICE_WORK_MASK;
+       } while (cmpxchg(flags, old, new) != old);
+       return old & DRBD_DEVICE_WORK_MASK;
+}
+
+static void do_unqueued_work(struct drbd_connection *connection)
+{
+       struct drbd_peer_device *peer_device;
+       int vnr;
+
+       rcu_read_lock();
+       idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+               struct drbd_device *device = peer_device->device;
+               unsigned long todo = get_work_bits(&device->flags);
+               if (!todo)
+                       continue;
+
+               kref_get(&device->kref);
+               rcu_read_unlock();
+               do_device_work(device, todo);
+               kref_put(&device->kref, drbd_destroy_device);
+               rcu_read_lock();
+       }
+       rcu_read_unlock();
+}
+
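+/* Splice the entire queue onto work_list while holding the queue lock,
+ * so the worker drains whole batches with a single lock round trip.
+ * Returns true if any work was dequeued. */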
+static bool dequeue_work_batch(struct drbd_work_queue *queue, struct list_head *work_list)
+{
+       spin_lock_irq(&queue->q_lock);
+       list_splice_tail_init(&queue->q, work_list);
+       spin_unlock_irq(&queue->q_lock);
+       return !list_empty(work_list);
+}
+
+static void wait_for_work(struct drbd_connection *connection, struct list_head *work_list)
+{
+       DEFINE_WAIT(wait);
+       struct net_conf *nc;
+       int uncork, cork;
+
+       dequeue_work_batch(&connection->sender_work, work_list);
+       if (!list_empty(work_list))
+               return;
+
+       /* Still nothing to do?
+        * Maybe we still need to close the current epoch,
+        * even if no new requests are queued yet.
+        *
+        * Also, poke TCP, just in case.
+        * Then wait for new work (or signal). */
+       rcu_read_lock();
+       nc = rcu_dereference(connection->net_conf);
+       uncork = nc ? nc->tcp_cork : 0;
+       rcu_read_unlock();
+       if (uncork) {
+               mutex_lock(&connection->data.mutex);
+               if (connection->data.socket)
+                       drbd_tcp_uncork(connection->data.socket);
+               mutex_unlock(&connection->data.mutex);
+       }
+
+       for (;;) {
+               int send_barrier;
+               prepare_to_wait(&connection->sender_work.q_wait, &wait, TASK_INTERRUPTIBLE);
+               spin_lock_irq(&connection->resource->req_lock);
+               spin_lock(&connection->sender_work.q_lock);     /* FIXME get rid of this one? */
+               if (!list_empty(&connection->sender_work.q))
+                       list_splice_tail_init(&connection->sender_work.q, work_list);
+               spin_unlock(&connection->sender_work.q_lock);   /* FIXME get rid of this one? */
+               if (!list_empty(work_list) || signal_pending(current)) {
+                       spin_unlock_irq(&connection->resource->req_lock);
+                       break;
+               }
+
+               /* We found nothing new to do: no request waiting to be
+                * communicated, no other work item.  We may still need to close the last
+                * epoch.  Next incoming request epoch will be connection ->
+                * current transfer log epoch number.  If that is different
+                * from the epoch of the last request we communicated, it is
+                * safe to send the epoch separating barrier now.
+                */
+               send_barrier =
+                       atomic_read(&connection->current_tle_nr) !=
+                       connection->send.current_epoch_nr;
+               spin_unlock_irq(&connection->resource->req_lock);
+
+               if (send_barrier)
+                       maybe_send_barrier(connection,
+                                       connection->send.current_epoch_nr + 1);
+
+               if (test_bit(DEVICE_WORK_PENDING, &connection->flags))
+                       break;
+
+               /* drbd_send() may have called flush_signals() */
+               if (get_t_state(&connection->worker) != RUNNING)
+                       break;
+
+               schedule();
+               /* We may be woken up for things other than new work, too,
+                * e.g. if the current epoch got closed;
+                * in that case we send the barrier above. */
+       }
+       finish_wait(&connection->sender_work.q_wait, &wait);
+
+       /* someone may have changed the config while we have been waiting above. */
+       rcu_read_lock();
+       nc = rcu_dereference(connection->net_conf);
+       cork = nc ? nc->tcp_cork : 0;
+       rcu_read_unlock();
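+       /* If corking is (still) enabled, cork the data socket so the
+        * following sends get batched; if it is now disabled and we did
+        * not already uncork on entry, uncork now. */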
+       mutex_lock(&connection->data.mutex);
+       if (connection->data.socket) {
+               if (cork)
+                       drbd_tcp_cork(connection->data.socket);
+               else if (!uncork)
+                       drbd_tcp_uncork(connection->data.socket);
+       }
+       mutex_unlock(&connection->data.mutex);
+}
+
+int drbd_worker(struct drbd_thread *thi)
+{
+       struct drbd_connection *connection = thi->connection;
+       struct drbd_work *w = NULL;
+       struct drbd_peer_device *peer_device;
+       LIST_HEAD(work_list);
+       int vnr;
+
+       while (get_t_state(thi) == RUNNING) {
+               drbd_thread_current_set_cpu(thi);
+
+               if (list_empty(&work_list)) {
+                       update_worker_timing_details(connection, wait_for_work);
+                       wait_for_work(connection, &work_list);
+               }
+
+               if (test_and_clear_bit(DEVICE_WORK_PENDING, &connection->flags)) {
+                       update_worker_timing_details(connection, do_unqueued_work);
+                       do_unqueued_work(connection);
+               }
+
+               if (signal_pending(current)) {
+                       flush_signals(current);
+                       if (get_t_state(thi) == RUNNING) {
+                               drbd_warn(connection, "Worker got an unexpected signal\n");
+                               continue;
+                       }
+                       break;
+               }
+
+               if (get_t_state(thi) != RUNNING)
+                       break;
+
+               if (!list_empty(&work_list)) {
+                       w = list_first_entry(&work_list, struct drbd_work, list);
+                       list_del_init(&w->list);
+                       update_worker_timing_details(connection, w->cb);
+                       if (w->cb(w, connection->cstate < C_WF_REPORT_PARAMS) == 0)
+                               continue;
+                       if (connection->cstate >= C_WF_REPORT_PARAMS)
+                               conn_request_state(connection, NS(conn, C_NETWORK_FAILURE), CS_HARD);
+               }
+       }
+
+       do {
+               if (test_and_clear_bit(DEVICE_WORK_PENDING, &connection->flags)) {
+                       update_worker_timing_details(connection, do_unqueued_work);
+                       do_unqueued_work(connection);
+               }
+               if (!list_empty(&work_list)) {
+                       w = list_first_entry(&work_list, struct drbd_work, list);
+                       list_del_init(&w->list);
+                       update_worker_timing_details(connection, w->cb);
+                       w->cb(w, 1);
+               } else
+                       dequeue_work_batch(&connection->sender_work, &work_list);
+       } while (!list_empty(&work_list) || test_bit(DEVICE_WORK_PENDING, &connection->flags));
+
+       rcu_read_lock();
+       idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+               struct drbd_device *device = peer_device->device;
+               D_ASSERT(device, device->state.disk == D_DISKLESS && device->state.conn == C_STANDALONE);
+               kref_get(&device->kref);
+               rcu_read_unlock();
+               drbd_device_cleanup(device);
+               kref_put(&device->kref, drbd_destroy_device);
+               rcu_read_lock();
+       }
+       rcu_read_unlock();
+
+       return 0;
+}