These changes are the raw update to linux-4.4.6-rt14.
diff --git a/kernel/fs/ceph/snap.c b/kernel/fs/ceph/snap.c
index a97e39f..4aa7122 100644
--- a/kernel/fs/ceph/snap.c
+++ b/kernel/fs/ceph/snap.c
@@ -296,7 +296,7 @@ static int cmpu64_rev(const void *a, const void *b)
 }
 
 
-static struct ceph_snap_context *empty_snapc;
+struct ceph_snap_context *ceph_empty_snapc;
 
 /*
  * build the snap context for a given realm.
@@ -338,12 +338,6 @@ static int build_snap_context(struct ceph_snap_realm *realm)
                return 0;
        }
 
-       if (num == 0 && realm->seq == empty_snapc->seq) {
-               ceph_get_snap_context(empty_snapc);
-               snapc = empty_snapc;
-               goto done;
-       }
-
        /* alloc new snap context */
        err = -ENOMEM;
        if (num > (SIZE_MAX - sizeof(*snapc)) / sizeof(u64))
@@ -381,7 +375,6 @@ static int build_snap_context(struct ceph_snap_realm *realm)
             realm->ino, realm, snapc, snapc->seq,
             (unsigned int) snapc->num_snaps);
 
-done:
        ceph_put_snap_context(realm->cached_context);
        realm->cached_context = snapc;
        return 0;
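
With the empty_snapc shortcut removed, build_snap_context() always takes the
allocation path, so the overflow guard kept in the context above (num >
(SIZE_MAX - sizeof(*snapc)) / sizeof(u64)) is what rejects oversized snap
counts. A minimal user-space sketch of that same guard, assuming a
trailing-array layout like ceph_snap_context's (the struct and function names
here are illustrative, not from the patch):

    #include <stdint.h>
    #include <stdlib.h>

    /* illustrative stand-in for ceph_snap_context: header + u64 snaps[] */
    struct snapc {
            uint64_t seq;
            uint32_t num_snaps;
            uint64_t snaps[];
    };

    static struct snapc *snapc_alloc(uint32_t num)
    {
            /* same overflow guard as above: refuse any num for which
               sizeof(*sc) + num * sizeof(u64) would exceed SIZE_MAX */
            if (num > (SIZE_MAX - sizeof(struct snapc)) / sizeof(uint64_t))
                    return NULL;
            return calloc(1, sizeof(struct snapc) +
                             (size_t)num * sizeof(uint64_t));
    }
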
@@ -436,6 +429,14 @@ static int dup_array(u64 **dst, __le64 *src, u32 num)
        return 0;
 }
 
+static bool has_new_snaps(struct ceph_snap_context *o,
+                         struct ceph_snap_context *n)
+{
+       if (n->num_snaps == 0)
+               return false;
+       /* snaps are in descending order */
+       return n->snaps[0] > o->seq;
+}
 
 /*
  * When a snapshot is applied, the size/mtime inode metadata is queued
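
The has_new_snaps() helper added above relies on the invariant that snaps[]
is sorted in descending order, so "does the realm have a snapshot newer than
the context the head was writing under" is an O(1) comparison of snaps[0]
against the old context's seq. A self-contained illustration, with invented
values and a simplified struct:

    #include <assert.h>
    #include <stdbool.h>
    #include <stdint.h>

    struct snapc {
            uint64_t seq;
            uint32_t num_snaps;
            const uint64_t *snaps;  /* newest first (descending order) */
    };

    static bool has_new_snaps(const struct snapc *o, const struct snapc *n)
    {
            if (n->num_snaps == 0)
                    return false;
            /* snaps are in descending order, so snaps[0] is the newest */
            return n->snaps[0] > o->seq;
    }

    int main(void)
    {
            const uint64_t snaps[] = { 7, 5, 2 };
            struct snapc old_ctx = { .seq = 5 };
            struct snapc new_ctx = { .seq = 7, .num_snaps = 3,
                                     .snaps = snaps };

            assert(has_new_snaps(&old_ctx, &new_ctx));  /* snap 7 > seq 5 */
            old_ctx.seq = 7;
            assert(!has_new_snaps(&old_ctx, &new_ctx)); /* nothing newer */
            return 0;
    }
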
@@ -455,6 +456,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
 {
        struct inode *inode = &ci->vfs_inode;
        struct ceph_cap_snap *capsnap;
+       struct ceph_snap_context *old_snapc, *new_snapc;
        int used, dirty;
 
        capsnap = kzalloc(sizeof(*capsnap), GFP_NOFS);
@@ -467,6 +469,9 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
        used = __ceph_caps_used(ci);
        dirty = __ceph_caps_dirty(ci);
 
+       old_snapc = ci->i_head_snapc;
+       new_snapc = ci->i_snap_realm->cached_context;
+
        /*
         * If there is a write in progress, treat that as a dirty Fw,
         * even though it hasn't completed yet; by the time we finish
@@ -481,76 +486,95 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
                   writes in progress now were started before the previous
                   cap_snap.  lucky us. */
                dout("queue_cap_snap %p already pending\n", inode);
-               kfree(capsnap);
-       } else if (ci->i_snap_realm->cached_context == empty_snapc) {
-               dout("queue_cap_snap %p empty snapc\n", inode);
-               kfree(capsnap);
-       } else if (dirty & (CEPH_CAP_AUTH_EXCL|CEPH_CAP_XATTR_EXCL|
-                           CEPH_CAP_FILE_EXCL|CEPH_CAP_FILE_WR)) {
-               struct ceph_snap_context *snapc = ci->i_head_snapc;
-
-               /*
-                * if we are a sync write, we may need to go to the snaprealm
-                * to get the current snapc.
-                */
-               if (!snapc)
-                       snapc = ci->i_snap_realm->cached_context;
+               goto update_snapc;
+       }
+       if (ci->i_wrbuffer_ref_head == 0 &&
+           !(dirty & (CEPH_CAP_ANY_EXCL|CEPH_CAP_FILE_WR))) {
+               dout("queue_cap_snap %p nothing dirty|writing\n", inode);
+               goto update_snapc;
+       }
 
-               dout("queue_cap_snap %p cap_snap %p queuing under %p %s\n",
-                    inode, capsnap, snapc, ceph_cap_string(dirty));
-               ihold(inode);
+       BUG_ON(!old_snapc);
 
-               atomic_set(&capsnap->nref, 1);
-               capsnap->ci = ci;
-               INIT_LIST_HEAD(&capsnap->ci_item);
-               INIT_LIST_HEAD(&capsnap->flushing_item);
-
-               capsnap->follows = snapc->seq;
-               capsnap->issued = __ceph_caps_issued(ci, NULL);
-               capsnap->dirty = dirty;
-
-               capsnap->mode = inode->i_mode;
-               capsnap->uid = inode->i_uid;
-               capsnap->gid = inode->i_gid;
-
-               if (dirty & CEPH_CAP_XATTR_EXCL) {
-                       __ceph_build_xattrs_blob(ci);
-                       capsnap->xattr_blob =
-                               ceph_buffer_get(ci->i_xattrs.blob);
-                       capsnap->xattr_version = ci->i_xattrs.version;
-               } else {
-                       capsnap->xattr_blob = NULL;
-                       capsnap->xattr_version = 0;
+       /*
+        * There is no need to send a FLUSHSNAP message to the MDS if
+        * there is no new snapshot. But when there are dirty pages or
+        * on-going writes, we still need to create a cap_snap: it is
+        * needed by the write path and the page writeback path.
+        *
+        * also see ceph_try_drop_cap_snap()
+        */
+       if (has_new_snaps(old_snapc, new_snapc)) {
+               if (dirty & (CEPH_CAP_ANY_EXCL|CEPH_CAP_FILE_WR))
+                       capsnap->need_flush = true;
+       } else {
+               if (!(used & CEPH_CAP_FILE_WR) &&
+                   ci->i_wrbuffer_ref_head == 0) {
+                       dout("queue_cap_snap %p "
+                            "no new_snap|dirty_page|writing\n", inode);
+                       goto update_snapc;
                }
+       }
 
-               capsnap->inline_data = ci->i_inline_version != CEPH_INLINE_NONE;
-
-               /* dirty page count moved from _head to this cap_snap;
-                  all subsequent writes page dirties occur _after_ this
-                  snapshot. */
-               capsnap->dirty_pages = ci->i_wrbuffer_ref_head;
-               ci->i_wrbuffer_ref_head = 0;
-               capsnap->context = snapc;
-               ci->i_head_snapc =
-                       ceph_get_snap_context(ci->i_snap_realm->cached_context);
-               dout(" new snapc is %p\n", ci->i_head_snapc);
-               list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps);
-
-               if (used & CEPH_CAP_FILE_WR) {
-                       dout("queue_cap_snap %p cap_snap %p snapc %p"
-                            " seq %llu used WR, now pending\n", inode,
-                            capsnap, snapc, snapc->seq);
-                       capsnap->writing = 1;
-               } else {
-                       /* note mtime, size NOW. */
-                       __ceph_finish_cap_snap(ci, capsnap);
-               }
+       dout("queue_cap_snap %p cap_snap %p queuing under %p %s %s\n",
+            inode, capsnap, old_snapc, ceph_cap_string(dirty),
+            capsnap->need_flush ? "" : "no_flush");
+       ihold(inode);
+
+       atomic_set(&capsnap->nref, 1);
+       capsnap->ci = ci;
+       INIT_LIST_HEAD(&capsnap->ci_item);
+       INIT_LIST_HEAD(&capsnap->flushing_item);
+
+       capsnap->follows = old_snapc->seq;
+       capsnap->issued = __ceph_caps_issued(ci, NULL);
+       capsnap->dirty = dirty;
+
+       capsnap->mode = inode->i_mode;
+       capsnap->uid = inode->i_uid;
+       capsnap->gid = inode->i_gid;
+
+       if (dirty & CEPH_CAP_XATTR_EXCL) {
+               __ceph_build_xattrs_blob(ci);
+               capsnap->xattr_blob =
+                       ceph_buffer_get(ci->i_xattrs.blob);
+               capsnap->xattr_version = ci->i_xattrs.version;
        } else {
-               dout("queue_cap_snap %p nothing dirty|writing\n", inode);
-               kfree(capsnap);
+               capsnap->xattr_blob = NULL;
+               capsnap->xattr_version = 0;
        }
 
+       capsnap->inline_data = ci->i_inline_version != CEPH_INLINE_NONE;
+
+       /* The dirty page count moves from _head to this cap_snap;
+          page dirties from all subsequent writes occur _after_ this
+          snapshot. */
+       capsnap->dirty_pages = ci->i_wrbuffer_ref_head;
+       ci->i_wrbuffer_ref_head = 0;
+       capsnap->context = old_snapc;
+       list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps);
+
+       if (used & CEPH_CAP_FILE_WR) {
+               dout("queue_cap_snap %p cap_snap %p snapc %p"
+                    " seq %llu used WR, now pending\n", inode,
+                    capsnap, old_snapc, old_snapc->seq);
+               capsnap->writing = 1;
+       } else {
+               /* note mtime, size NOW. */
+               __ceph_finish_cap_snap(ci, capsnap);
+       }
+       capsnap = NULL;
+       old_snapc = NULL;
+
+update_snapc:
+       if (ci->i_head_snapc) {
+               ci->i_head_snapc = ceph_get_snap_context(new_snapc);
+               dout(" new snapc is %p\n", new_snapc);
+       }
        spin_unlock(&ci->i_ceph_lock);
+
+       kfree(capsnap);
+       ceph_put_snap_context(old_snapc);
 }
 
 /*
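
Per the comment in the hunk above, the reworked ceph_queue_cap_snap() boils
down to a three-way decision: skip entirely, queue a cap_snap only for the
local write/writeback paths, or queue one that must also be flushed to the
MDS. Stripped of locking and reference counting, the decision reads roughly
as follows; this is a paraphrase for illustration, not kernel code, and the
enum is invented:

    #include <stdbool.h>

    enum capsnap_action { SKIP, QUEUE_NO_FLUSH, QUEUE_AND_FLUSH };

    /*
     * dirty_caps: Fw or any *_EXCL cap is dirty (an in-flight write
     * counts as dirty Fw, exactly as in the function above).
     */
    static enum capsnap_action decide(bool new_snaps, bool dirty_caps,
                                      bool writing, bool dirty_pages)
    {
            if (!dirty_pages && !dirty_caps)
                    return SKIP;            /* nothing dirty|writing */
            if (new_snaps)
                    return dirty_caps ? QUEUE_AND_FLUSH : QUEUE_NO_FLUSH;
            if (writing || dirty_pages)
                    return QUEUE_NO_FLUSH;  /* needed locally only */
            return SKIP;                    /* no new_snap|dirty_page|writing */
    }

Note the asymmetry: a cap_snap with need_flush unset is still queued while
pages or writes are outstanding, because the write and writeback paths need
the old snap context attached to that in-flight data.
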
@@ -699,6 +723,8 @@ more:
 
                /* queue realm for cap_snap creation */
                list_add(&realm->dirty_item, &dirty_realms);
+               if (realm->seq > mdsc->last_snap_seq)
+                       mdsc->last_snap_seq = realm->seq;
 
                invalidate = 1;
        } else if (!realm->cached_context) {
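
The added last_snap_seq update is a running high-water mark over the
snapshot sequences of the dirty realms; code outside this hunk can compare
against it to know the newest snapshot it must have flushed up to. The
pattern itself (names illustrative):

    #include <stdint.h>

    /* running high-water mark, as in the hunk above */
    static inline void note_snap_seq(uint64_t *last_snap_seq,
                                     uint64_t realm_seq)
    {
            if (realm_seq > *last_snap_seq)
                    *last_snap_seq = realm_seq;
    }
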
@@ -964,14 +990,14 @@ out:
 
 int __init ceph_snap_init(void)
 {
-       empty_snapc = ceph_create_snap_context(0, GFP_NOFS);
-       if (!empty_snapc)
+       ceph_empty_snapc = ceph_create_snap_context(0, GFP_NOFS);
+       if (!ceph_empty_snapc)
                return -ENOMEM;
-       empty_snapc->seq = 1;
+       ceph_empty_snapc->seq = 1;
        return 0;
 }
 
 void ceph_snap_exit(void)
 {
-       ceph_put_snap_context(empty_snapc);
+       ceph_put_snap_context(ceph_empty_snapc);
 }
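
Since empty_snapc is renamed to ceph_empty_snapc and loses its static
qualifier, other compilation units can now share the one refcounted empty
context (seq 1) created in ceph_snap_init() instead of building their own.
That implies a matching extern declaration in a shared header, along these
lines; the header location is an assumption, as this diff does not show it:

    /* presumably in fs/ceph/super.h (not shown in this diff) */
    extern struct ceph_snap_context *ceph_empty_snapc;
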