These changes are the raw update to linux-4.4.6-rt14. Kernel sources
[kvmfornfv.git] / kernel / fs / ceph / inode.c
index e876e19..498dcfa 100644 (file)
@@ -6,7 +6,6 @@
 #include <linux/string.h>
 #include <linux/uaccess.h>
 #include <linux/kernel.h>
-#include <linux/namei.h>
 #include <linux/writeback.h>
 #include <linux/vmalloc.h>
 #include <linux/posix_acl.h>
@@ -390,9 +389,10 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
        ci->i_inline_version = 0;
        ci->i_time_warp_seq = 0;
        ci->i_ceph_flags = 0;
-       ci->i_ordered_count = 0;
-       atomic_set(&ci->i_release_count, 1);
-       atomic_set(&ci->i_complete_count, 0);
+       atomic64_set(&ci->i_ordered_count, 1);
+       atomic64_set(&ci->i_release_count, 1);
+       atomic64_set(&ci->i_complete_seq[0], 0);
+       atomic64_set(&ci->i_complete_seq[1], 0);
        ci->i_symlink = NULL;
 
        memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout));
@@ -416,9 +416,8 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
        ci->i_flushing_caps = 0;
        INIT_LIST_HEAD(&ci->i_dirty_item);
        INIT_LIST_HEAD(&ci->i_flushing_item);
-       ci->i_cap_flush_seq = 0;
-       ci->i_cap_flush_last_tid = 0;
-       memset(&ci->i_cap_flush_tid, 0, sizeof(ci->i_cap_flush_tid));
+       ci->i_prealloc_cap_flush = NULL;
+       ci->i_cap_flush_tree = RB_ROOT;
        init_waitqueue_head(&ci->i_cap_wq);
        ci->i_hold_caps_min = 0;
        ci->i_hold_caps_max = 0;
@@ -453,6 +452,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
 
        INIT_LIST_HEAD(&ci->i_unsafe_writes);
        INIT_LIST_HEAD(&ci->i_unsafe_dirops);
+       INIT_LIST_HEAD(&ci->i_unsafe_iops);
        spin_lock_init(&ci->i_unsafe_lock);
 
        ci->i_snap_realm = NULL;
@@ -753,7 +753,10 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
 
        if (new_version ||
            (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
+               if (ci->i_layout.fl_pg_pool != info->layout.fl_pg_pool)
+                       ci->i_ceph_flags &= ~CEPH_I_POOL_PERM;
                ci->i_layout = info->layout;
+
                queue_trunc = ceph_fill_file_size(inode, issued,
                                        le32_to_cpu(info->truncate_seq),
                                        le64_to_cpu(info->truncate_size),
@@ -819,6 +822,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
                        else
                                kfree(sym); /* lost a race */
                }
+               inode->i_link = ci->i_symlink;
                break;
        case S_IFDIR:
                inode->i_op = &ceph_dir_iops;
@@ -858,9 +862,10 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
                            (issued & CEPH_CAP_FILE_EXCL) == 0 &&
                            !__ceph_dir_is_complete(ci)) {
                                dout(" marking %p complete (empty)\n", inode);
+                               i_size_write(inode, 0);
                                __ceph_dir_set_complete(ci,
-                                       atomic_read(&ci->i_release_count),
-                                       ci->i_ordered_count);
+                                       atomic64_read(&ci->i_release_count),
+                                       atomic64_read(&ci->i_ordered_count));
                        }
 
                        wake = true;
@@ -1212,6 +1217,10 @@ retry_lookup:
                        dout("fill_trace doing d_move %p -> %p\n",
                             req->r_old_dentry, dn);
 
+                       /* d_move screws up sibling dentries' offsets */
+                       ceph_dir_clear_ordered(dir);
+                       ceph_dir_clear_ordered(olddir);
+
                        d_move(req->r_old_dentry, dn);
                        dout(" src %p '%pd' dst %p '%pd'\n",
                             req->r_old_dentry,
@@ -1222,10 +1231,6 @@ retry_lookup:
                           rehashing bug in vfs_rename_dir */
                        ceph_invalidate_dentry_lease(dn);
 
-                       /* d_move screws up sibling dentries' offsets */
-                       ceph_dir_clear_ordered(dir);
-                       ceph_dir_clear_ordered(olddir);
-
                        dout("dn %p gets new offset %lld\n", req->r_old_dentry,
                             ceph_dentry(req->r_old_dentry)->offset);
 
@@ -1333,6 +1338,49 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req,
        return err;
 }
 
+void ceph_readdir_cache_release(struct ceph_readdir_cache_control *ctl)
+{      /* Release the cache page currently held in ctl, if there is one. */
+       if (ctl->page) {
+               kunmap(ctl->page);      /* undo the kmap() done in fill_readdir_cache() */
+               page_cache_release(ctl->page);  /* give up ctl's hold on the page */
+               ctl->page = NULL;       /* no page held now, so a repeat call does nothing */
+       }
+}
+
+static int fill_readdir_cache(struct inode *dir, struct dentry *dn,
+                             struct ceph_readdir_cache_control *ctl,
+                             struct ceph_mds_request *req)
+{      /* Store dn at slot ctl->index of dir's readdir cache; returns 0 or -ENOMEM. */
+       struct ceph_inode_info *ci = ceph_inode(dir);
+       unsigned nsize = PAGE_CACHE_SIZE / sizeof(struct dentry*);      /* dentry pointers per page */
+       unsigned idx = ctl->index % nsize;      /* slot inside the page */
+       pgoff_t pgoff = ctl->index / nsize;     /* page offset within dir->i_data */
+
+       if (!ctl->page || pgoff != page_index(ctl->page)) {     /* nothing mapped yet, or slot lives on another page */
+               ceph_readdir_cache_release(ctl);
+               ctl->page  = grab_cache_page(&dir->i_data, pgoff);
+               if (!ctl->page) {
+                       ctl->index = -1;        /* negative index: ceph_readdir_prepopulate() stops calling us */
+                       return -ENOMEM;
+               }
+               /* reading/filling the cache are serialized by
+                * i_mutex, no need to use page lock */
+               unlock_page(ctl->page);
+               ctl->dentries = kmap(ctl->page);        /* view the mapped page as an array of dentry pointers */
+       }
+
+       if (req->r_dir_release_cnt == atomic64_read(&ci->i_release_count) &&    /* counters still match the values saved in req */
+           req->r_dir_ordered_cnt == atomic64_read(&ci->i_ordered_count)) {
+               dout("readdir cache dn %p idx %d\n", dn, ctl->index);
+               ctl->dentries[idx] = dn;
+               ctl->index++;   /* next call fills the following slot */
+       } else {
+               dout("disable readdir cache\n");
+               ctl->index = -1;        /* counters changed: stop caching for this request */
+       }
+       return 0;       /* disabling the cache is not an error; only -ENOMEM is */
+}
+
 int ceph_readdir_prepopulate(struct ceph_mds_request *req,
                             struct ceph_mds_session *session)
 {
@@ -1345,8 +1393,11 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
        struct inode *snapdir = NULL;
        struct ceph_mds_request_head *rhead = req->r_request->front.iov_base;
        struct ceph_dentry_info *di;
-       u64 r_readdir_offset = req->r_readdir_offset;
        u32 frag = le32_to_cpu(rhead->args.readdir.frag);
+       struct ceph_readdir_cache_control cache_ctl = {};
+
+       if (req->r_aborted)
+               return readdir_prepopulate_inodes_only(req, session);
 
        if (rinfo->dir_dir &&
            le32_to_cpu(rinfo->dir_dir->frag) != frag) {
@@ -1354,14 +1405,11 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
                     frag, le32_to_cpu(rinfo->dir_dir->frag));
                frag = le32_to_cpu(rinfo->dir_dir->frag);
                if (ceph_frag_is_leftmost(frag))
-                       r_readdir_offset = 2;
+                       req->r_readdir_offset = 2;
                else
-                       r_readdir_offset = 0;
+                       req->r_readdir_offset = 0;
        }
 
-       if (req->r_aborted)
-               return readdir_prepopulate_inodes_only(req, session);
-
        if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) {
                snapdir = ceph_get_snapdir(d_inode(parent));
                parent = d_find_alias(snapdir);
@@ -1374,6 +1422,17 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
                        ceph_fill_dirfrag(d_inode(parent), rinfo->dir_dir);
        }
 
+       if (ceph_frag_is_leftmost(frag) && req->r_readdir_offset == 2) {
+               /* note dir version at start of readdir so we can tell
+                * if any dentries get dropped */
+               struct ceph_inode_info *ci = ceph_inode(d_inode(parent));
+               req->r_dir_release_cnt = atomic64_read(&ci->i_release_count);
+               req->r_dir_ordered_cnt = atomic64_read(&ci->i_ordered_count);
+               req->r_readdir_cache_idx = 0;
+       }
+
+       cache_ctl.index = req->r_readdir_cache_idx;
+
        /* FIXME: release caps/leases if error occurs */
        for (i = 0; i < rinfo->dir_nr; i++) {
                struct ceph_vino vino;
@@ -1413,13 +1472,6 @@ retry_lookup:
                        d_delete(dn);
                        dput(dn);
                        goto retry_lookup;
-               } else {
-                       /* reorder parent's d_subdirs */
-                       spin_lock(&parent->d_lock);
-                       spin_lock_nested(&dn->d_lock, DENTRY_D_LOCK_NESTED);
-                       list_move(&dn->d_child, &parent->d_subdirs);
-                       spin_unlock(&dn->d_lock);
-                       spin_unlock(&parent->d_lock);
                }
 
                /* inode */
@@ -1436,13 +1488,15 @@ retry_lookup:
                        }
                }
 
-               if (fill_inode(in, NULL, &rinfo->dir_in[i], NULL, session,
-                              req->r_request_started, -1,
-                              &req->r_caps_reservation) < 0) {
+               ret = fill_inode(in, NULL, &rinfo->dir_in[i], NULL, session,
+                                req->r_request_started, -1,
+                                &req->r_caps_reservation);
+               if (ret < 0) {
                        pr_err("fill_inode badness on %p\n", in);
                        if (d_really_is_negative(dn))
                                iput(in);
                        d_drop(dn);
+                       err = ret;
                        goto next_item;
                }
 
@@ -1458,19 +1512,28 @@ retry_lookup:
                }
 
                di = dn->d_fsdata;
-               di->offset = ceph_make_fpos(frag, i + r_readdir_offset);
+               di->offset = ceph_make_fpos(frag, i + req->r_readdir_offset);
 
                update_dentry_lease(dn, rinfo->dir_dlease[i],
                                    req->r_session,
                                    req->r_request_started);
+
+               if (err == 0 && cache_ctl.index >= 0) {
+                       ret = fill_readdir_cache(d_inode(parent), dn,
+                                                &cache_ctl, req);
+                       if (ret < 0)
+                               err = ret;
+               }
 next_item:
                if (dn)
                        dput(dn);
        }
-       if (err == 0)
-               req->r_did_prepopulate = true;
-
 out:
+       if (err == 0) {
+               req->r_did_prepopulate = true;
+               req->r_readdir_cache_idx = cache_ctl.index;
+       }
+       ceph_readdir_cache_release(&cache_ctl);
        if (snapdir) {
                iput(snapdir);
                dput(parent);
@@ -1691,16 +1754,9 @@ retry:
 /*
  * symlinks
  */
-static void *ceph_sym_follow_link(struct dentry *dentry, struct nameidata *nd)
-{
-       struct ceph_inode_info *ci = ceph_inode(d_inode(dentry));
-       nd_set_link(nd, ci->i_symlink);
-       return NULL;
-}
-
 static const struct inode_operations ceph_symlink_iops = {
        .readlink = generic_readlink,
-       .follow_link = ceph_sym_follow_link,
+       .follow_link = simple_follow_link,
        .setattr = ceph_setattr,
        .getattr = ceph_getattr,
        .setxattr = ceph_setxattr,
@@ -1719,11 +1775,13 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
        const unsigned int ia_valid = attr->ia_valid;
        struct ceph_mds_request *req;
        struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc;
+       struct ceph_cap_flush *prealloc_cf;
        int issued;
        int release = 0, dirtied = 0;
        int mask = 0;
        int err = 0;
        int inode_dirty_flags = 0;
+       bool lock_snap_rwsem = false;
 
        if (ceph_snap(inode) != CEPH_NOSNAP)
                return -EROFS;
@@ -1732,13 +1790,31 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
        if (err != 0)
                return err;
 
+       prealloc_cf = ceph_alloc_cap_flush();
+       if (!prealloc_cf)
+               return -ENOMEM;
+
        req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETATTR,
                                       USE_AUTH_MDS);
-       if (IS_ERR(req))
+       if (IS_ERR(req)) {
+               ceph_free_cap_flush(prealloc_cf);
                return PTR_ERR(req);
+       }
 
        spin_lock(&ci->i_ceph_lock);
        issued = __ceph_caps_issued(ci, NULL);
+
+       if (!ci->i_head_snapc &&
+           (issued & (CEPH_CAP_ANY_EXCL | CEPH_CAP_FILE_WR))) {
+               lock_snap_rwsem = true;
+               if (!down_read_trylock(&mdsc->snap_rwsem)) {
+                       spin_unlock(&ci->i_ceph_lock);
+                       down_read(&mdsc->snap_rwsem);
+                       spin_lock(&ci->i_ceph_lock);
+                       issued = __ceph_caps_issued(ci, NULL);
+               }
+       }
+
        dout("setattr %p issued %s\n", inode, ceph_cap_string(issued));
 
        if (ia_valid & ATTR_UID) {
@@ -1881,12 +1957,15 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
                dout("setattr %p ATTR_FILE ... hrm!\n", inode);
 
        if (dirtied) {
-               inode_dirty_flags = __ceph_mark_dirty_caps(ci, dirtied);
+               inode_dirty_flags = __ceph_mark_dirty_caps(ci, dirtied,
+                                                          &prealloc_cf);
                inode->i_ctime = CURRENT_TIME;
        }
 
        release &= issued;
        spin_unlock(&ci->i_ceph_lock);
+       if (lock_snap_rwsem)
+               up_read(&mdsc->snap_rwsem);
 
        if (inode_dirty_flags)
                __mark_inode_dirty(inode, inode_dirty_flags);
@@ -1911,9 +1990,11 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
        ceph_mdsc_put_request(req);
        if (mask & CEPH_SETATTR_SIZE)
                __ceph_do_pending_vmtruncate(inode);
+       ceph_free_cap_flush(prealloc_cf);
        return err;
 out_put:
        ceph_mdsc_put_request(req);
+       ceph_free_cap_flush(prealloc_cf);
        return err;
 }