These changes are the raw update to linux-4.4.6-rt14. Kernel sources
[kvmfornfv.git] / kernel / fs / ceph / dir.c
index 4248307..9314b4e 100644 (file)
@@ -38,7 +38,7 @@ int ceph_init_dentry(struct dentry *dentry)
        if (dentry->d_fsdata)
                return 0;
 
-       di = kmem_cache_alloc(ceph_dentry_cachep, GFP_NOFS | __GFP_ZERO);
+       di = kmem_cache_alloc(ceph_dentry_cachep, GFP_KERNEL | __GFP_ZERO);
        if (!di)
                return -ENOMEM;          /* oh well */
 
@@ -106,6 +106,27 @@ static int fpos_cmp(loff_t l, loff_t r)
        return (int)(fpos_off(l) - fpos_off(r));
 }
 
+/*
+ * Record the name of the last dentry we read, plus the offset to resume
+ * from, in @fi so that readdir can continue at the same lexicographical
+ * point regardless of what dir changes take place on the server.
+ * Returns 0 on success, or -ENOMEM with @fi left unmodified.
+ */
+static int note_last_dentry(struct ceph_file_info *fi, const char *name,
+                           int len, unsigned next_offset)
+{
+       char *buf = kmalloc(len+1, GFP_KERNEL);
+       if (!buf)
+               return -ENOMEM;
+       kfree(fi->last_name);   /* old name is dropped only after the new buffer exists */
+       fi->last_name = buf;
+       memcpy(fi->last_name, name, len);
+       fi->last_name[len] = 0; /* NUL-terminate the copied name */
+       fi->next_offset = next_offset;
+       dout("note_last_dentry '%s'\n", fi->last_name);
+       return 0;
+}
+
 /*
  * When possible, we try to satisfy a readdir by peeking at the
  * dcache.  We make this work by carefully ordering dentries on
@@ -123,123 +144,113 @@ static int __dcache_readdir(struct file *file,  struct dir_context *ctx,
        struct ceph_file_info *fi = file->private_data;
        struct dentry *parent = file->f_path.dentry;
        struct inode *dir = d_inode(parent);
-       struct list_head *p;
-       struct dentry *dentry, *last;
+       struct dentry *dentry, *last = NULL;
        struct ceph_dentry_info *di;
+       unsigned nsize = PAGE_CACHE_SIZE / sizeof(struct dentry *);
        int err = 0;
+       loff_t ptr_pos = 0;
+       struct ceph_readdir_cache_control cache_ctl = {};
 
-       /* claim ref on last dentry we returned */
-       last = fi->dentry;
-       fi->dentry = NULL;
-
-       dout("__dcache_readdir %p v%u at %llu (last %p)\n",
-            dir, shared_gen, ctx->pos, last);
+       dout("__dcache_readdir %p v%u at %llu\n", dir, shared_gen, ctx->pos);
 
-       spin_lock(&parent->d_lock);
-
-       /* start at beginning? */
-       if (ctx->pos == 2 || last == NULL ||
-           fpos_cmp(ctx->pos, ceph_dentry(last)->offset) < 0) {
-               if (list_empty(&parent->d_subdirs))
-                       goto out_unlock;
-               p = parent->d_subdirs.prev;
-               dout(" initial p %p/%p\n", p->prev, p->next);
-       } else {
-               p = last->d_child.prev;
+       /* we can calculate cache index for the first dirfrag */
+       if (ceph_frag_is_leftmost(fpos_frag(ctx->pos))) {
+               cache_ctl.index = fpos_off(ctx->pos) - 2;
+               BUG_ON(cache_ctl.index < 0);
+               ptr_pos = cache_ctl.index * sizeof(struct dentry *);
        }
 
-more:
-       dentry = list_entry(p, struct dentry, d_child);
-       di = ceph_dentry(dentry);
-       while (1) {
-               dout(" p %p/%p %s d_subdirs %p/%p\n", p->prev, p->next,
-                    d_unhashed(dentry) ? "!hashed" : "hashed",
-                    parent->d_subdirs.prev, parent->d_subdirs.next);
-               if (p == &parent->d_subdirs) {
+       while (true) {
+               pgoff_t pgoff;
+               bool emit_dentry;
+
+               if (ptr_pos >= i_size_read(dir)) {
                        fi->flags |= CEPH_F_ATEND;
-                       goto out_unlock;
+                       err = 0;
+                       break;
+               }
+
+               err = -EAGAIN;
+               pgoff = ptr_pos >> PAGE_CACHE_SHIFT;
+               if (!cache_ctl.page || pgoff != page_index(cache_ctl.page)) {
+                       ceph_readdir_cache_release(&cache_ctl);
+                       cache_ctl.page = find_lock_page(&dir->i_data, pgoff);
+                       if (!cache_ctl.page) {
+                               dout(" page %lu not found\n", pgoff);
+                               break;
+                       }
+                       /* reading/filling the cache are serialized by
+                        * i_mutex, no need to use page lock */
+                       unlock_page(cache_ctl.page);
+                       cache_ctl.dentries = kmap(cache_ctl.page);
                }
-               spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
+
+               rcu_read_lock();
+               spin_lock(&parent->d_lock);
+               /* check i_size again here, because empty directory can be
+                * marked as complete while not holding the i_mutex. */
+               if (ceph_dir_is_complete_ordered(dir) &&
+                   ptr_pos < i_size_read(dir))
+                       dentry = cache_ctl.dentries[cache_ctl.index % nsize];
+               else
+                       dentry = NULL;
+               spin_unlock(&parent->d_lock);
+               if (dentry && !lockref_get_not_dead(&dentry->d_lockref))
+                       dentry = NULL;
+               rcu_read_unlock();
+               if (!dentry)
+                       break;
+
+               emit_dentry = false;
+               di = ceph_dentry(dentry);
+               spin_lock(&dentry->d_lock);
                if (di->lease_shared_gen == shared_gen &&
-                   !d_unhashed(dentry) && d_really_is_positive(dentry) &&
+                   d_really_is_positive(dentry) &&
                    ceph_snap(d_inode(dentry)) != CEPH_SNAPDIR &&
                    ceph_ino(d_inode(dentry)) != CEPH_INO_CEPH &&
-                   fpos_cmp(ctx->pos, di->offset) <= 0)
-                       break;
-               dout(" skipping %p %pd at %llu (%llu)%s%s\n", dentry,
-                    dentry, di->offset,
-                    ctx->pos, d_unhashed(dentry) ? " unhashed" : "",
-                    !d_inode(dentry) ? " null" : "");
+                   fpos_cmp(ctx->pos, di->offset) <= 0) {
+                       emit_dentry = true;
+               }
                spin_unlock(&dentry->d_lock);
-               p = p->prev;
-               dentry = list_entry(p, struct dentry, d_child);
-               di = ceph_dentry(dentry);
-       }
-
-       dget_dlock(dentry);
-       spin_unlock(&dentry->d_lock);
-       spin_unlock(&parent->d_lock);
 
-       /* make sure a dentry wasn't dropped while we didn't have parent lock */
-       if (!ceph_dir_is_complete_ordered(dir)) {
-               dout(" lost dir complete on %p; falling back to mds\n", dir);
-               dput(dentry);
-               err = -EAGAIN;
-               goto out;
-       }
+               if (emit_dentry) {
+                       dout(" %llu (%llu) dentry %p %pd %p\n", di->offset, ctx->pos,
+                            dentry, dentry, d_inode(dentry));
+                       ctx->pos = di->offset;
+                       if (!dir_emit(ctx, dentry->d_name.name,
+                                     dentry->d_name.len,
+                                     ceph_translate_ino(dentry->d_sb,
+                                                        d_inode(dentry)->i_ino),
+                                     d_inode(dentry)->i_mode >> 12)) {
+                               dput(dentry);
+                               err = 0;
+                               break;
+                       }
+                       ctx->pos++;
 
-       dout(" %llu (%llu) dentry %p %pd %p\n", di->offset, ctx->pos,
-            dentry, dentry, d_inode(dentry));
-       if (!dir_emit(ctx, dentry->d_name.name,
-                     dentry->d_name.len,
-                     ceph_translate_ino(dentry->d_sb, d_inode(dentry)->i_ino),
-                     d_inode(dentry)->i_mode >> 12)) {
-               if (last) {
-                       /* remember our position */
-                       fi->dentry = last;
-                       fi->next_offset = fpos_off(di->offset);
+                       if (last)
+                               dput(last);
+                       last = dentry;
+               } else {
+                       dput(dentry);
                }
-               dput(dentry);
-               return 0;
-       }
-
-       ctx->pos = di->offset + 1;
 
-       if (last)
-               dput(last);
-       last = dentry;
-
-       spin_lock(&parent->d_lock);
-       p = p->prev;    /* advance to next dentry */
-       goto more;
-
-out_unlock:
-       spin_unlock(&parent->d_lock);
-out:
-       if (last)
+               cache_ctl.index++;
+               ptr_pos += sizeof(struct dentry *);
+       }
+       ceph_readdir_cache_release(&cache_ctl);
+       if (last) {
+               int ret;
+               di = ceph_dentry(last);
+               ret = note_last_dentry(fi, last->d_name.name, last->d_name.len,
+                                      fpos_off(di->offset) + 1);
+               if (ret < 0)
+                       err = ret;
                dput(last);
+       }
        return err;
 }
 
-/*
- * make note of the last dentry we read, so we can
- * continue at the same lexicographical point,
- * regardless of what dir changes take place on the
- * server.
- */
-static int note_last_dentry(struct ceph_file_info *fi, const char *name,
-                           int len)
-{
-       kfree(fi->last_name);
-       fi->last_name = kmalloc(len+1, GFP_NOFS);
-       if (!fi->last_name)
-               return -ENOMEM;
-       memcpy(fi->last_name, name, len);
-       fi->last_name[len] = 0;
-       dout("note_last_dentry '%s'\n", fi->last_name);
-       return 0;
-}
-
 static int ceph_readdir(struct file *file, struct dir_context *ctx)
 {
        struct ceph_file_info *fi = file->private_data;
@@ -280,8 +291,7 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
 
        /* can we use the dcache? */
        spin_lock(&ci->i_ceph_lock);
-       if ((ctx->pos == 2 || fi->dentry) &&
-           ceph_test_mount_opt(fsc, DCACHE) &&
+       if (ceph_test_mount_opt(fsc, DCACHE) &&
            !ceph_test_mount_opt(fsc, NOASYNCREADDIR) &&
            ceph_snap(inode) != CEPH_SNAPDIR &&
            __ceph_dir_is_complete_ordered(ci) &&
@@ -296,24 +306,8 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
        } else {
                spin_unlock(&ci->i_ceph_lock);
        }
-       if (fi->dentry) {
-               err = note_last_dentry(fi, fi->dentry->d_name.name,
-                                      fi->dentry->d_name.len);
-               if (err)
-                       return err;
-               dput(fi->dentry);
-               fi->dentry = NULL;
-       }
 
        /* proceed with a normal readdir */
-
-       if (ctx->pos == 2) {
-               /* note dir version at start of readdir so we can tell
-                * if any dentries get dropped */
-               fi->dir_release_count = atomic_read(&ci->i_release_count);
-               fi->dir_ordered_count = ci->i_ordered_count;
-       }
-
 more:
        /* do we have the correct frag content buffered? */
        if (fi->frag != frag || fi->last_readdir == NULL) {
@@ -342,12 +336,15 @@ more:
                req->r_direct_hash = ceph_frag_value(frag);
                req->r_direct_is_hash = true;
                if (fi->last_name) {
-                       req->r_path2 = kstrdup(fi->last_name, GFP_NOFS);
+                       req->r_path2 = kstrdup(fi->last_name, GFP_KERNEL);
                        if (!req->r_path2) {
                                ceph_mdsc_put_request(req);
                                return -ENOMEM;
                        }
                }
+               req->r_dir_release_cnt = fi->dir_release_count;
+               req->r_dir_ordered_cnt = fi->dir_ordered_count;
+               req->r_readdir_cache_idx = fi->readdir_cache_idx;
                req->r_readdir_offset = fi->next_offset;
                req->r_args.readdir.frag = cpu_to_le32(frag);
 
@@ -364,26 +361,38 @@ more:
                     (int)req->r_reply_info.dir_end,
                     (int)req->r_reply_info.dir_complete);
 
-               if (!req->r_did_prepopulate) {
-                       dout("readdir !did_prepopulate");
-                       /* preclude from marking dir complete */
-                       fi->dir_release_count--;
-               }
 
                /* note next offset and last dentry name */
                rinfo = &req->r_reply_info;
                if (le32_to_cpu(rinfo->dir_dir->frag) != frag) {
                        frag = le32_to_cpu(rinfo->dir_dir->frag);
-                       if (ceph_frag_is_leftmost(frag))
-                               fi->next_offset = 2;
-                       else
-                               fi->next_offset = 0;
-                       off = fi->next_offset;
+                       off = req->r_readdir_offset;
+                       fi->next_offset = off;
                }
+
                fi->frag = frag;
                fi->offset = fi->next_offset;
                fi->last_readdir = req;
 
+               if (req->r_did_prepopulate) {
+                       fi->readdir_cache_idx = req->r_readdir_cache_idx;
+                       if (fi->readdir_cache_idx < 0) {
+                               /* preclude from marking dir ordered */
+                               fi->dir_ordered_count = 0;
+                       } else if (ceph_frag_is_leftmost(frag) && off == 2) {
+                               /* note dir version at start of readdir so
+                                * we can tell if any dentries get dropped */
+                               fi->dir_release_count = req->r_dir_release_cnt;
+                               fi->dir_ordered_count = req->r_dir_ordered_cnt;
+                       }
+               } else {
+                       dout("readdir !did_prepopulate");
+                       /* disable readdir cache */
+                       fi->readdir_cache_idx = -1;
+                       /* preclude from marking dir complete */
+                       fi->dir_release_count = 0;
+               }
+
                if (req->r_reply_info.dir_end) {
                        kfree(fi->last_name);
                        fi->last_name = NULL;
@@ -394,10 +403,10 @@ more:
                } else {
                        err = note_last_dentry(fi,
                                       rinfo->dir_dname[rinfo->dir_nr-1],
-                                      rinfo->dir_dname_len[rinfo->dir_nr-1]);
+                                      rinfo->dir_dname_len[rinfo->dir_nr-1],
+                                      fi->next_offset + rinfo->dir_nr);
                        if (err)
                                return err;
-                       fi->next_offset += rinfo->dir_nr;
                }
        }
 
@@ -453,16 +462,22 @@ more:
         * were released during the whole readdir, and we should have
         * the complete dir contents in our cache.
         */
-       spin_lock(&ci->i_ceph_lock);
-       if (atomic_read(&ci->i_release_count) == fi->dir_release_count) {
-               if (ci->i_ordered_count == fi->dir_ordered_count)
+       if (atomic64_read(&ci->i_release_count) == fi->dir_release_count) {
+               spin_lock(&ci->i_ceph_lock);
+               if (fi->dir_ordered_count == atomic64_read(&ci->i_ordered_count)) {
                        dout(" marking %p complete and ordered\n", inode);
-               else
+                       /* use i_size to track number of entries in
+                        * readdir cache */
+                       BUG_ON(fi->readdir_cache_idx < 0);
+                       i_size_write(inode, fi->readdir_cache_idx *
+                                    sizeof(struct dentry*));
+               } else {
                        dout(" marking %p complete\n", inode);
+               }
                __ceph_dir_set_complete(ci, fi->dir_release_count,
                                        fi->dir_ordered_count);
+               spin_unlock(&ci->i_ceph_lock);
        }
-       spin_unlock(&ci->i_ceph_lock);
 
        dout("readdir %p file %p done.\n", inode, file);
        return 0;
@@ -476,14 +491,12 @@ static void reset_readdir(struct ceph_file_info *fi, unsigned frag)
        }
        kfree(fi->last_name);
        fi->last_name = NULL;
+       fi->dir_release_count = 0;
+       fi->readdir_cache_idx = -1;
        if (ceph_frag_is_leftmost(frag))
                fi->next_offset = 2;  /* compensate for . and .. */
        else
                fi->next_offset = 0;
-       if (fi->dentry) {
-               dput(fi->dentry);
-               fi->dentry = NULL;
-       }
        fi->flags &= ~CEPH_F_ATEND;
 }
 
@@ -497,13 +510,12 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
        mutex_lock(&inode->i_mutex);
        retval = -EINVAL;
        switch (whence) {
-       case SEEK_END:
-               offset += inode->i_size + 2;   /* FIXME */
-               break;
        case SEEK_CUR:
                offset += file->f_pos;
        case SEEK_SET:
                break;
+       case SEEK_END:
+               retval = -EOPNOTSUPP;
        default:
                goto out;
        }
@@ -516,20 +528,18 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
                }
                retval = offset;
 
-               /*
-                * discard buffered readdir content on seekdir(0), or
-                * seek to new frag, or seek prior to current chunk.
-                */
                if (offset == 0 ||
                    fpos_frag(offset) != fi->frag ||
                    fpos_off(offset) < fi->offset) {
+                       /* discard buffered readdir content on seekdir(0), or
+                        * seek to new frag, or seek prior to current chunk */
                        dout("dir_llseek dropping %p content\n", file);
                        reset_readdir(fi, fpos_frag(offset));
+               } else if (fpos_cmp(offset, old_offset) > 0) {
+                       /* reset dir_release_count if we did a forward seek */
+                       fi->dir_release_count = 0;
+                       fi->readdir_cache_idx = -1;
                }
-
-               /* bump dir_release_count if we did a forward seek */
-               if (fpos_cmp(offset, old_offset) > 0)
-                       fi->dir_release_count--;
        }
 out:
        mutex_unlock(&inode->i_mutex);
@@ -764,7 +774,7 @@ static int ceph_symlink(struct inode *dir, struct dentry *dentry,
                err = PTR_ERR(req);
                goto out;
        }
-       req->r_path2 = kstrdup(dest, GFP_NOFS);
+       req->r_path2 = kstrdup(dest, GFP_KERNEL);
        if (!req->r_path2) {
                err = -ENOMEM;
                ceph_mdsc_put_request(req);
@@ -985,16 +995,15 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
                 * to do it here.
                 */
 
+               /* d_move screws up sibling dentries' offsets */
+               ceph_dir_clear_complete(old_dir);
+               ceph_dir_clear_complete(new_dir);
+
                d_move(old_dentry, new_dentry);
 
                /* ensure target dentry is invalidated, despite
                   rehashing bug in vfs_rename_dir */
                ceph_invalidate_dentry_lease(new_dentry);
-
-               /* d_move screws up sibling dentries' offsets */
-               ceph_dir_clear_complete(old_dir);
-               ceph_dir_clear_complete(new_dir);
-
        }
        ceph_mdsc_put_request(req);
        return err;
@@ -1189,7 +1198,7 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
                return -EISDIR;
 
        if (!cf->dir_info) {
-               cf->dir_info = kmalloc(bufsize, GFP_NOFS);
+               cf->dir_info = kmalloc(bufsize, GFP_KERNEL);
                if (!cf->dir_info)
                        return -ENOMEM;
                cf->dir_info_len =
@@ -1223,66 +1232,6 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
        return size - left;
 }
 
-/*
- * an fsync() on a dir will wait for any uncommitted directory
- * operations to commit.
- */
-static int ceph_dir_fsync(struct file *file, loff_t start, loff_t end,
-                         int datasync)
-{
-       struct inode *inode = file_inode(file);
-       struct ceph_inode_info *ci = ceph_inode(inode);
-       struct list_head *head = &ci->i_unsafe_dirops;
-       struct ceph_mds_request *req;
-       u64 last_tid;
-       int ret = 0;
-
-       dout("dir_fsync %p\n", inode);
-       ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
-       if (ret)
-               return ret;
-       mutex_lock(&inode->i_mutex);
-
-       spin_lock(&ci->i_unsafe_lock);
-       if (list_empty(head))
-               goto out;
-
-       req = list_entry(head->prev,
-                        struct ceph_mds_request, r_unsafe_dir_item);
-       last_tid = req->r_tid;
-
-       do {
-               ceph_mdsc_get_request(req);
-               spin_unlock(&ci->i_unsafe_lock);
-
-               dout("dir_fsync %p wait on tid %llu (until %llu)\n",
-                    inode, req->r_tid, last_tid);
-               if (req->r_timeout) {
-                       unsigned long time_left = wait_for_completion_timeout(
-                                                       &req->r_safe_completion,
-                                                       req->r_timeout);
-                       if (time_left > 0)
-                               ret = 0;
-                       else
-                               ret = -EIO;  /* timed out */
-               } else {
-                       wait_for_completion(&req->r_safe_completion);
-               }
-               ceph_mdsc_put_request(req);
-
-               spin_lock(&ci->i_unsafe_lock);
-               if (ret || list_empty(head))
-                       break;
-               req = list_entry(head->next,
-                                struct ceph_mds_request, r_unsafe_dir_item);
-       } while (req->r_tid < last_tid);
-out:
-       spin_unlock(&ci->i_unsafe_lock);
-       mutex_unlock(&inode->i_mutex);
-
-       return ret;
-}
-
 /*
  * We maintain a private dentry LRU.
  *
@@ -1353,7 +1302,7 @@ const struct file_operations ceph_dir_fops = {
        .open = ceph_open,
        .release = ceph_release,
        .unlocked_ioctl = ceph_ioctl,
-       .fsync = ceph_dir_fsync,
+       .fsync = ceph_fsync,
 };
 
 const struct file_operations ceph_snapdir_fops = {