These changes are the raw update to the linux-4.4.6-rt14 kernel sources.
[kvmfornfv.git] / kernel / fs / nfs / flexfilelayout / flexfilelayout.c
index fecd920..2a2e2d8 100644 (file)
@@ -20,6 +20,7 @@
 #include "../nfs4trace.h"
 #include "../iostat.h"
 #include "../nfs.h"
+#include "../nfs42.h"
 
 #define NFSDBG_FACILITY         NFSDBG_PNFS_LD
 
@@ -33,6 +34,7 @@ ff_layout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags)
        ffl = kzalloc(sizeof(*ffl), gfp_flags);
        if (ffl) {
                INIT_LIST_HEAD(&ffl->error_list);
+               INIT_LIST_HEAD(&ffl->mirrors);
                return &ffl->generic_hdr;
        } else
                return NULL;
@@ -134,6 +136,95 @@ decode_name(struct xdr_stream *xdr, u32 *id)
        return 0;
 }
 
+static bool ff_mirror_match_fh(const struct nfs4_ff_layout_mirror *m1,
+               const struct nfs4_ff_layout_mirror *m2)
+{
+       int i, j;
+
+       if (m1->fh_versions_cnt != m2->fh_versions_cnt)
+               return false;
+       for (i = 0; i < m1->fh_versions_cnt; i++) {
+               bool found_fh = false;
+               for (j = 0; j < m2->fh_versions_cnt; j++) {
+                       if (nfs_compare_fh(&m1->fh_versions[i],
+                                       &m2->fh_versions[j]) == 0) {
+                               found_fh = true;
+                               break;
+                       }
+               }
+               if (!found_fh)
+                       return false;
+       }
+       return true;
+}
+
+static struct nfs4_ff_layout_mirror *
+ff_layout_add_mirror(struct pnfs_layout_hdr *lo,
+               struct nfs4_ff_layout_mirror *mirror)
+{
+       struct nfs4_flexfile_layout *ff_layout = FF_LAYOUT_FROM_HDR(lo);
+       struct nfs4_ff_layout_mirror *pos;
+       struct inode *inode = lo->plh_inode;
+
+       spin_lock(&inode->i_lock);
+       list_for_each_entry(pos, &ff_layout->mirrors, mirrors) {
+               if (mirror->mirror_ds != pos->mirror_ds)
+                       continue;
+               if (!ff_mirror_match_fh(mirror, pos))
+                       continue;
+               if (atomic_inc_not_zero(&pos->ref)) {
+                       spin_unlock(&inode->i_lock);
+                       return pos;
+               }
+       }
+       list_add(&mirror->mirrors, &ff_layout->mirrors);
+       mirror->layout = lo;
+       spin_unlock(&inode->i_lock);
+       return mirror;
+}
+
+static void
+ff_layout_remove_mirror(struct nfs4_ff_layout_mirror *mirror)
+{
+       struct inode *inode;
+       if (mirror->layout == NULL)
+               return;
+       inode = mirror->layout->plh_inode;
+       spin_lock(&inode->i_lock);
+       list_del(&mirror->mirrors);
+       spin_unlock(&inode->i_lock);
+       mirror->layout = NULL;
+}
+
+static struct nfs4_ff_layout_mirror *ff_layout_alloc_mirror(gfp_t gfp_flags)
+{
+       struct nfs4_ff_layout_mirror *mirror;
+
+       mirror = kzalloc(sizeof(*mirror), gfp_flags);
+       if (mirror != NULL) {
+               spin_lock_init(&mirror->lock);
+               atomic_set(&mirror->ref, 1);
+               INIT_LIST_HEAD(&mirror->mirrors);
+       }
+       return mirror;
+}
+
+static void ff_layout_free_mirror(struct nfs4_ff_layout_mirror *mirror)
+{
+       ff_layout_remove_mirror(mirror);
+       kfree(mirror->fh_versions);
+       if (mirror->cred)
+               put_rpccred(mirror->cred);
+       nfs4_ff_layout_put_deviceid(mirror->mirror_ds);
+       kfree(mirror);
+}
+
+static void ff_layout_put_mirror(struct nfs4_ff_layout_mirror *mirror)
+{
+       if (mirror != NULL && atomic_dec_and_test(&mirror->ref))
+               ff_layout_free_mirror(mirror);
+}
+
 static void ff_layout_free_mirror_array(struct nfs4_ff_layout_segment *fls)
 {
        int i;
@@ -143,11 +234,7 @@ static void ff_layout_free_mirror_array(struct nfs4_ff_layout_segment *fls)
                        /* normally mirror_ds is freed in
                         * .free_deviceid_node but we still do it here
                         * for .alloc_lseg error path */
-                       if (fls->mirror_array[i]) {
-                               kfree(fls->mirror_array[i]->fh_versions);
-                               nfs4_ff_layout_put_deviceid(fls->mirror_array[i]->mirror_ds);
-                               kfree(fls->mirror_array[i]);
-                       }
+                       ff_layout_put_mirror(fls->mirror_array[i]);
                }
                kfree(fls->mirror_array);
                fls->mirror_array = NULL;
@@ -180,19 +267,88 @@ static void _ff_layout_free_lseg(struct nfs4_ff_layout_segment *fls)
        }
 }
 
+static bool
+ff_lseg_range_is_after(const struct pnfs_layout_range *l1,
+               const struct pnfs_layout_range *l2)
+{
+       u64 end1, end2;
+
+       if (l1->iomode != l2->iomode)
+               return l1->iomode != IOMODE_READ;
+       end1 = pnfs_calc_offset_end(l1->offset, l1->length);
+       end2 = pnfs_calc_offset_end(l2->offset, l2->length);
+       if (end1 < l2->offset)
+               return false;
+       if (end2 < l1->offset)
+               return true;
+       return l2->offset <= l1->offset;
+}
+
+static bool
+ff_lseg_merge(struct pnfs_layout_segment *new,
+               struct pnfs_layout_segment *old)
+{
+       u64 new_end, old_end;
+
+       if (new->pls_range.iomode != old->pls_range.iomode)
+               return false;
+       old_end = pnfs_calc_offset_end(old->pls_range.offset,
+                       old->pls_range.length);
+       if (old_end < new->pls_range.offset)
+               return false;
+       new_end = pnfs_calc_offset_end(new->pls_range.offset,
+                       new->pls_range.length);
+       if (new_end < old->pls_range.offset)
+               return false;
+
+       /* Mergeable: copy info from 'old' to 'new' */
+       if (new_end < old_end)
+               new_end = old_end;
+       if (new->pls_range.offset < old->pls_range.offset)
+               new->pls_range.offset = old->pls_range.offset;
+       new->pls_range.length = pnfs_calc_offset_length(new->pls_range.offset,
+                       new_end);
+       if (test_bit(NFS_LSEG_ROC, &old->pls_flags))
+               set_bit(NFS_LSEG_ROC, &new->pls_flags);
+       if (test_bit(NFS_LSEG_LAYOUTRETURN, &old->pls_flags))
+               set_bit(NFS_LSEG_LAYOUTRETURN, &new->pls_flags);
+       return true;
+}
+
+static void
+ff_layout_add_lseg(struct pnfs_layout_hdr *lo,
+               struct pnfs_layout_segment *lseg,
+               struct list_head *free_me)
+{
+       pnfs_generic_layout_insert_lseg(lo, lseg,
+                       ff_lseg_range_is_after,
+                       ff_lseg_merge,
+                       free_me);
+}
+
 static void ff_layout_sort_mirrors(struct nfs4_ff_layout_segment *fls)
 {
-       struct nfs4_ff_layout_mirror *tmp;
        int i, j;
 
        for (i = 0; i < fls->mirror_array_cnt - 1; i++) {
                for (j = i + 1; j < fls->mirror_array_cnt; j++)
                        if (fls->mirror_array[i]->efficiency <
-                           fls->mirror_array[j]->efficiency) {
-                               tmp = fls->mirror_array[i];
-                               fls->mirror_array[i] = fls->mirror_array[j];
-                               fls->mirror_array[j] = tmp;
-                       }
+                           fls->mirror_array[j]->efficiency)
+                               swap(fls->mirror_array[i],
+                                    fls->mirror_array[j]);
+       }
+}
+
+static void ff_layout_mark_devices_valid(struct nfs4_ff_layout_segment *fls)
+{
+       struct nfs4_deviceid_node *node;
+       int i;
+
+       if (!(fls->flags & FF_FLAGS_NO_IO_THRU_MDS))
+               return;
+       for (i = 0; i < fls->mirror_array_cnt; i++) {
+               node = &fls->mirror_array[i]->mirror_ds->id_node;
+               clear_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags);
        }
 }
 
@@ -248,6 +404,7 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
                goto out_err_free;
 
        for (i = 0; i < fls->mirror_array_cnt; i++) {
+               struct nfs4_ff_layout_mirror *mirror;
                struct nfs4_deviceid devid;
                struct nfs4_deviceid_node *idnode;
                u32 ds_count;
@@ -264,15 +421,12 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
                if (ds_count != 1)
                        goto out_err_free;
 
-               fls->mirror_array[i] =
-                       kzalloc(sizeof(struct nfs4_ff_layout_mirror),
-                               gfp_flags);
+               fls->mirror_array[i] = ff_layout_alloc_mirror(gfp_flags);
                if (fls->mirror_array[i] == NULL) {
                        rc = -ENOMEM;
                        goto out_err_free;
                }
 
-               spin_lock_init(&fls->mirror_array[i]->lock);
                fls->mirror_array[i]->ds_count = ds_count;
 
                /* deviceid */
@@ -339,15 +493,26 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
                if (rc)
                        goto out_err_free;
 
+               mirror = ff_layout_add_mirror(lh, fls->mirror_array[i]);
+               if (mirror != fls->mirror_array[i]) {
+                       ff_layout_free_mirror(fls->mirror_array[i]);
+                       fls->mirror_array[i] = mirror;
+               }
+
                dprintk("%s: uid %d gid %d\n", __func__,
                        fls->mirror_array[i]->uid,
                        fls->mirror_array[i]->gid);
        }
 
+       p = xdr_inline_decode(&stream, 4);
+       if (p)
+               fls->flags = be32_to_cpup(p);
+
        ff_layout_sort_mirrors(fls);
        rc = ff_layout_check_layout(lgr);
        if (rc)
                goto out_err_free;
+       ff_layout_mark_devices_valid(fls);
 
        ret = &fls->generic_hdr;
        dprintk("<-- %s (success)\n", __func__);
@@ -376,21 +541,9 @@ static void
 ff_layout_free_lseg(struct pnfs_layout_segment *lseg)
 {
        struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg);
-       int i;
 
        dprintk("--> %s\n", __func__);
 
-       for (i = 0; i < fls->mirror_array_cnt; i++) {
-               if (fls->mirror_array[i]) {
-                       nfs4_ff_layout_put_deviceid(fls->mirror_array[i]->mirror_ds);
-                       fls->mirror_array[i]->mirror_ds = NULL;
-                       if (fls->mirror_array[i]->cred) {
-                               put_rpccred(fls->mirror_array[i]->cred);
-                               fls->mirror_array[i]->cred = NULL;
-                       }
-               }
-       }
-
        if (lseg->pls_range.iomode == IOMODE_RW) {
                struct nfs4_flexfile_layout *ffl;
                struct inode *inode;
@@ -415,6 +568,146 @@ ff_layout_get_lseg_count(struct nfs4_ff_layout_segment *fls)
        return 1;
 }
 
+static void
+nfs4_ff_start_busy_timer(struct nfs4_ff_busy_timer *timer, ktime_t now)
+{
+       /* first IO request? */
+       if (atomic_inc_return(&timer->n_ops) == 1) {
+               timer->start_time = now;
+       }
+}
+
+static ktime_t
+nfs4_ff_end_busy_timer(struct nfs4_ff_busy_timer *timer, ktime_t now)
+{
+       ktime_t start;
+
+       if (atomic_dec_return(&timer->n_ops) < 0)
+               WARN_ON_ONCE(1);
+
+       start = timer->start_time;
+       timer->start_time = now;
+       return ktime_sub(now, start);
+}
+
+static bool
+nfs4_ff_layoutstat_start_io(struct nfs4_ff_layout_mirror *mirror,
+                           struct nfs4_ff_layoutstat *layoutstat,
+                           ktime_t now)
+{
+       static const ktime_t notime = {0};
+       s64 report_interval = FF_LAYOUTSTATS_REPORT_INTERVAL;
+
+       nfs4_ff_start_busy_timer(&layoutstat->busy_timer, now);
+       if (ktime_equal(mirror->start_time, notime))
+               mirror->start_time = now;
+       if (ktime_equal(mirror->last_report_time, notime))
+               mirror->last_report_time = now;
+       if (layoutstats_timer != 0)
+               report_interval = (s64)layoutstats_timer * 1000LL;
+       if (ktime_to_ms(ktime_sub(now, mirror->last_report_time)) >=
+                       report_interval) {
+               mirror->last_report_time = now;
+               return true;
+       }
+
+       return false;
+}
+
+static void
+nfs4_ff_layout_stat_io_update_requested(struct nfs4_ff_layoutstat *layoutstat,
+               __u64 requested)
+{
+       struct nfs4_ff_io_stat *iostat = &layoutstat->io_stat;
+
+       iostat->ops_requested++;
+       iostat->bytes_requested += requested;
+}
+
+static void
+nfs4_ff_layout_stat_io_update_completed(struct nfs4_ff_layoutstat *layoutstat,
+               __u64 requested,
+               __u64 completed,
+               ktime_t time_completed,
+               ktime_t time_started)
+{
+       struct nfs4_ff_io_stat *iostat = &layoutstat->io_stat;
+       ktime_t completion_time = ktime_sub(time_completed, time_started);
+       ktime_t timer;
+
+       iostat->ops_completed++;
+       iostat->bytes_completed += completed;
+       iostat->bytes_not_delivered += requested - completed;
+
+       timer = nfs4_ff_end_busy_timer(&layoutstat->busy_timer, time_completed);
+       iostat->total_busy_time =
+                       ktime_add(iostat->total_busy_time, timer);
+       iostat->aggregate_completion_time =
+                       ktime_add(iostat->aggregate_completion_time,
+                                       completion_time);
+}
+
+static void
+nfs4_ff_layout_stat_io_start_read(struct inode *inode,
+               struct nfs4_ff_layout_mirror *mirror,
+               __u64 requested, ktime_t now)
+{
+       bool report;
+
+       spin_lock(&mirror->lock);
+       report = nfs4_ff_layoutstat_start_io(mirror, &mirror->read_stat, now);
+       nfs4_ff_layout_stat_io_update_requested(&mirror->read_stat, requested);
+       spin_unlock(&mirror->lock);
+
+       if (report)
+               pnfs_report_layoutstat(inode, GFP_KERNEL);
+}
+
+static void
+nfs4_ff_layout_stat_io_end_read(struct rpc_task *task,
+               struct nfs4_ff_layout_mirror *mirror,
+               __u64 requested,
+               __u64 completed)
+{
+       spin_lock(&mirror->lock);
+       nfs4_ff_layout_stat_io_update_completed(&mirror->read_stat,
+                       requested, completed,
+                       ktime_get(), task->tk_start);
+       spin_unlock(&mirror->lock);
+}
+
+static void
+nfs4_ff_layout_stat_io_start_write(struct inode *inode,
+               struct nfs4_ff_layout_mirror *mirror,
+               __u64 requested, ktime_t now)
+{
+       bool report;
+
+       spin_lock(&mirror->lock);
+       report = nfs4_ff_layoutstat_start_io(mirror, &mirror->write_stat, now);
+       nfs4_ff_layout_stat_io_update_requested(&mirror->write_stat, requested);
+       spin_unlock(&mirror->lock);
+
+       if (report)
+               pnfs_report_layoutstat(inode, GFP_NOIO);
+}
+
+static void
+nfs4_ff_layout_stat_io_end_write(struct rpc_task *task,
+               struct nfs4_ff_layout_mirror *mirror,
+               __u64 requested,
+               __u64 completed,
+               enum nfs3_stable_how committed)
+{
+       if (committed == NFS_UNSTABLE)
+               requested = completed = 0;
+
+       spin_lock(&mirror->lock);
+       nfs4_ff_layout_stat_io_update_completed(&mirror->write_stat,
+                       requested, completed, ktime_get(), task->tk_start);
+       spin_unlock(&mirror->lock);
+}
+
 static int
 ff_layout_alloc_commit_info(struct pnfs_layout_segment *lseg,
                            struct nfs_commit_info *cinfo,
@@ -462,17 +755,17 @@ ff_layout_alloc_commit_info(struct pnfs_layout_segment *lseg,
 }
 
 static struct nfs4_pnfs_ds *
-ff_layout_choose_best_ds_for_read(struct nfs_pageio_descriptor *pgio,
+ff_layout_choose_best_ds_for_read(struct pnfs_layout_segment *lseg,
+                                 int start_idx,
                                  int *best_idx)
 {
-       struct nfs4_ff_layout_segment *fls;
+       struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg);
        struct nfs4_pnfs_ds *ds;
        int idx;
 
-       fls = FF_LAYOUT_LSEG(pgio->pg_lseg);
        /* mirrors are sorted by efficiency */
-       for (idx = 0; idx < fls->mirror_array_cnt; idx++) {
-               ds = nfs4_ff_layout_prepare_ds(pgio->pg_lseg, idx, false);
+       for (idx = start_idx; idx < fls->mirror_array_cnt; idx++) {
+               ds = nfs4_ff_layout_prepare_ds(lseg, idx, false);
                if (ds) {
                        *best_idx = idx;
                        return ds;
@@ -503,7 +796,7 @@ ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio,
        if (pgio->pg_lseg == NULL)
                goto out_mds;
 
-       ds = ff_layout_choose_best_ds_for_read(pgio, &ds_idx);
+       ds = ff_layout_choose_best_ds_for_read(pgio->pg_lseg, 0, &ds_idx);
        if (!ds)
                goto out_mds;
        mirror = FF_LAYOUT_COMP(pgio->pg_lseg, ds_idx);
@@ -585,8 +878,6 @@ ff_layout_pg_get_mirror_count_write(struct nfs_pageio_descriptor *pgio,
                return FF_LAYOUT_MIRROR_COUNT(pgio->pg_lseg);
 
        /* no lseg means that pnfs is not in use, so no mirroring here */
-       pnfs_put_lseg(pgio->pg_lseg);
-       pgio->pg_lseg = NULL;
        nfs_pageio_reset_write_mds(pgio);
        return 1;
 }
@@ -758,7 +1049,8 @@ static int ff_layout_async_handle_error_v4(struct rpc_task *task,
                rpc_wake_up(&tbl->slot_tbl_waitq);
                /* fall through */
        default:
-               if (ff_layout_has_available_ds(lseg))
+               if (ff_layout_no_fallback_to_mds(lseg) ||
+                   ff_layout_has_available_ds(lseg))
                        return -NFS4ERR_RESET_TO_PNFS;
 reset:
                dprintk("%s Retry through MDS. Error %d\n", __func__,
@@ -788,18 +1080,26 @@ static int ff_layout_async_handle_error_v3(struct rpc_task *task,
        if (task->tk_status >= 0)
                return 0;
 
-       if (task->tk_status != -EJUKEBOX) {
+       switch (task->tk_status) {
+       /* File access problems. Don't mark the device as unavailable */
+       case -EACCES:
+       case -ESTALE:
+       case -EISDIR:
+       case -EBADHANDLE:
+       case -ELOOP:
+       case -ENOSPC:
+               break;
+       case -EJUKEBOX:
+               nfs_inc_stats(lseg->pls_layout->plh_inode, NFSIOS_DELAY);
+               goto out_retry;
+       default:
                dprintk("%s DS connection error %d\n", __func__,
                        task->tk_status);
                nfs4_mark_deviceid_unavailable(devid);
-               if (ff_layout_has_available_ds(lseg))
-                       return -NFS4ERR_RESET_TO_PNFS;
-               else
-                       return -NFS4ERR_RESET_TO_MDS;
        }
-
-       if (task->tk_status == -EJUKEBOX)
-               nfs_inc_stats(lseg->pls_layout->plh_inode, NFSIOS_DELAY);
+       /* FIXME: Need to prevent infinite looping here. */
+       return -NFS4ERR_RESET_TO_PNFS;
+out_retry:
        task->tk_status = 0;
        rpc_restart_call(task);
        rpc_delay(task, NFS_JUKEBOX_RETRY_TIME);
@@ -829,54 +1129,87 @@ static int ff_layout_async_handle_error(struct rpc_task *task,
 
 static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg,
                                        int idx, u64 offset, u64 length,
-                                       u32 status, int opnum)
+                                       u32 status, int opnum, int error)
 {
        struct nfs4_ff_layout_mirror *mirror;
        int err;
 
+       if (status == 0) {
+               switch (error) {
+               case -ETIMEDOUT:
+               case -EPFNOSUPPORT:
+               case -EPROTONOSUPPORT:
+               case -EOPNOTSUPP:
+               case -ECONNREFUSED:
+               case -ECONNRESET:
+               case -EHOSTDOWN:
+               case -EHOSTUNREACH:
+               case -ENETUNREACH:
+               case -EADDRINUSE:
+               case -ENOBUFS:
+               case -EPIPE:
+               case -EPERM:
+                       status = NFS4ERR_NXIO;
+                       break;
+               case -EACCES:
+                       status = NFS4ERR_ACCESS;
+                       break;
+               default:
+                       return;
+               }
+       }
+
        mirror = FF_LAYOUT_COMP(lseg, idx);
        err = ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout),
                                       mirror, offset, length, status, opnum,
                                       GFP_NOIO);
+       pnfs_error_mark_layout_for_return(lseg->pls_layout->plh_inode, lseg);
        dprintk("%s: err %d op %d status %u\n", __func__, err, opnum, status);
 }
 
 /* NFS_PROTO call done callback routines */
-
 static int ff_layout_read_done_cb(struct rpc_task *task,
                                struct nfs_pgio_header *hdr)
 {
-       struct inode *inode;
        int err;
 
        trace_nfs4_pnfs_read(hdr, task->tk_status);
-       if (task->tk_status == -ETIMEDOUT && !hdr->res.op_status)
-               hdr->res.op_status = NFS4ERR_NXIO;
-       if (task->tk_status < 0 && hdr->res.op_status)
+       if (task->tk_status < 0)
                ff_layout_io_track_ds_error(hdr->lseg, hdr->pgio_mirror_idx,
                                            hdr->args.offset, hdr->args.count,
-                                           hdr->res.op_status, OP_READ);
+                                           hdr->res.op_status, OP_READ,
+                                           task->tk_status);
        err = ff_layout_async_handle_error(task, hdr->args.context->state,
                                           hdr->ds_clp, hdr->lseg,
                                           hdr->pgio_mirror_idx);
 
        switch (err) {
        case -NFS4ERR_RESET_TO_PNFS:
+               if (ff_layout_choose_best_ds_for_read(hdr->lseg,
+                                       hdr->pgio_mirror_idx + 1,
+                                       &hdr->pgio_mirror_idx))
+                       goto out_eagain;
                set_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
                        &hdr->lseg->pls_layout->plh_flags);
                pnfs_read_resend_pnfs(hdr);
                return task->tk_status;
        case -NFS4ERR_RESET_TO_MDS:
-               inode = hdr->lseg->pls_layout->plh_inode;
-               pnfs_error_mark_layout_for_return(inode, hdr->lseg);
                ff_layout_reset_read(hdr);
                return task->tk_status;
        case -EAGAIN:
-               rpc_restart_call_prepare(task);
-               return -EAGAIN;
+               goto out_eagain;
        }
 
        return 0;
+out_eagain:
+       rpc_restart_call_prepare(task);
+       return -EAGAIN;
+}
+
+static bool
+ff_layout_need_layoutcommit(struct pnfs_layout_segment *lseg)
+{
+       return !(FF_LAYOUT_LSEG(lseg)->flags & FF_FLAGS_NO_LAYOUTCOMMIT);
 }
 
 /*
@@ -891,6 +1224,9 @@ static int ff_layout_read_done_cb(struct rpc_task *task,
 static void
 ff_layout_set_layoutcommit(struct nfs_pgio_header *hdr)
 {
+       if (!ff_layout_need_layoutcommit(hdr->lseg))
+               return;
+
        pnfs_set_layoutcommit(hdr->inode, hdr->lseg,
                        hdr->mds_offset + hdr->res.count);
        dprintk("%s inode %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino,
@@ -909,6 +1245,11 @@ ff_layout_reset_to_mds(struct pnfs_layout_segment *lseg, int idx)
 static int ff_layout_read_prepare_common(struct rpc_task *task,
                                         struct nfs_pgio_header *hdr)
 {
+       nfs4_ff_layout_stat_io_start_read(hdr->inode,
+                       FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
+                       hdr->args.count,
+                       task->tk_start);
+
        if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) {
                rpc_exit(task, -EIO);
                return -EIO;
@@ -962,15 +1303,15 @@ static void ff_layout_read_prepare_v4(struct rpc_task *task, void *data)
 {
        struct nfs_pgio_header *hdr = data;
 
-       if (ff_layout_read_prepare_common(task, hdr))
-               return;
-
        if (ff_layout_setup_sequence(hdr->ds_clp,
                                     &hdr->args.seq_args,
                                     &hdr->res.seq_res,
                                     task))
                return;
 
+       if (ff_layout_read_prepare_common(task, hdr))
+               return;
+
        if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context,
                        hdr->args.lock_context, FMODE_READ) == -EIO)
                rpc_exit(task, -EIO); /* lost lock, terminate I/O */
@@ -982,6 +1323,10 @@ static void ff_layout_read_call_done(struct rpc_task *task, void *data)
 
        dprintk("--> %s task->tk_status %d\n", __func__, task->tk_status);
 
+       nfs4_ff_layout_stat_io_end_read(task,
+                       FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
+                       hdr->args.count, hdr->res.count);
+
        if (test_bit(NFS_IOHDR_REDO, &hdr->flags) &&
            task->tk_status == 0) {
                nfs4_sequence_done(task, &hdr->res.seq_res);
@@ -1003,32 +1348,26 @@ static void ff_layout_read_count_stats(struct rpc_task *task, void *data)
 static int ff_layout_write_done_cb(struct rpc_task *task,
                                struct nfs_pgio_header *hdr)
 {
-       struct inode *inode;
        int err;
 
        trace_nfs4_pnfs_write(hdr, task->tk_status);
-       if (task->tk_status == -ETIMEDOUT && !hdr->res.op_status)
-               hdr->res.op_status = NFS4ERR_NXIO;
-       if (task->tk_status < 0 && hdr->res.op_status)
+       if (task->tk_status < 0)
                ff_layout_io_track_ds_error(hdr->lseg, hdr->pgio_mirror_idx,
                                            hdr->args.offset, hdr->args.count,
-                                           hdr->res.op_status, OP_WRITE);
+                                           hdr->res.op_status, OP_WRITE,
+                                           task->tk_status);
        err = ff_layout_async_handle_error(task, hdr->args.context->state,
                                           hdr->ds_clp, hdr->lseg,
                                           hdr->pgio_mirror_idx);
 
        switch (err) {
        case -NFS4ERR_RESET_TO_PNFS:
+               pnfs_set_retry_layoutget(hdr->lseg->pls_layout);
+               ff_layout_reset_write(hdr, true);
+               return task->tk_status;
        case -NFS4ERR_RESET_TO_MDS:
-               inode = hdr->lseg->pls_layout->plh_inode;
-               pnfs_error_mark_layout_for_return(inode, hdr->lseg);
-               if (err == -NFS4ERR_RESET_TO_PNFS) {
-                       pnfs_set_retry_layoutget(hdr->lseg->pls_layout);
-                       ff_layout_reset_write(hdr, true);
-               } else {
-                       pnfs_clear_retry_layoutget(hdr->lseg->pls_layout);
-                       ff_layout_reset_write(hdr, false);
-               }
+               pnfs_clear_retry_layoutget(hdr->lseg->pls_layout);
+               ff_layout_reset_write(hdr, false);
                return task->tk_status;
        case -EAGAIN:
                rpc_restart_call_prepare(task);
@@ -1050,28 +1389,24 @@ static int ff_layout_write_done_cb(struct rpc_task *task,
 static int ff_layout_commit_done_cb(struct rpc_task *task,
                                     struct nfs_commit_data *data)
 {
-       struct inode *inode;
        int err;
 
        trace_nfs4_pnfs_commit_ds(data, task->tk_status);
-       if (task->tk_status == -ETIMEDOUT && !data->res.op_status)
-               data->res.op_status = NFS4ERR_NXIO;
-       if (task->tk_status < 0 && data->res.op_status)
+       if (task->tk_status < 0)
                ff_layout_io_track_ds_error(data->lseg, data->ds_commit_index,
                                            data->args.offset, data->args.count,
-                                           data->res.op_status, OP_COMMIT);
+                                           data->res.op_status, OP_COMMIT,
+                                           task->tk_status);
        err = ff_layout_async_handle_error(task, NULL, data->ds_clp,
                                           data->lseg, data->ds_commit_index);
 
        switch (err) {
        case -NFS4ERR_RESET_TO_PNFS:
+               pnfs_set_retry_layoutget(data->lseg->pls_layout);
+               pnfs_generic_prepare_to_resend_writes(data);
+               return -EAGAIN;
        case -NFS4ERR_RESET_TO_MDS:
-               inode = data->lseg->pls_layout->plh_inode;
-               pnfs_error_mark_layout_for_return(inode, data->lseg);
-               if (err == -NFS4ERR_RESET_TO_PNFS)
-                       pnfs_set_retry_layoutget(data->lseg->pls_layout);
-               else
-                       pnfs_clear_retry_layoutget(data->lseg->pls_layout);
+               pnfs_clear_retry_layoutget(data->lseg->pls_layout);
                pnfs_generic_prepare_to_resend_writes(data);
                return -EAGAIN;
        case -EAGAIN:
@@ -1079,7 +1414,8 @@ static int ff_layout_commit_done_cb(struct rpc_task *task,
                return -EAGAIN;
        }
 
-       if (data->verf.committed == NFS_UNSTABLE)
+       if (data->verf.committed == NFS_UNSTABLE
+           && ff_layout_need_layoutcommit(data->lseg))
                pnfs_set_layoutcommit(data->inode, data->lseg, data->lwb);
 
        return 0;
@@ -1088,6 +1424,11 @@ static int ff_layout_commit_done_cb(struct rpc_task *task,
 static int ff_layout_write_prepare_common(struct rpc_task *task,
                                          struct nfs_pgio_header *hdr)
 {
+       nfs4_ff_layout_stat_io_start_write(hdr->inode,
+                       FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
+                       hdr->args.count,
+                       task->tk_start);
+
        if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) {
                rpc_exit(task, -EIO);
                return -EIO;
@@ -1121,15 +1462,15 @@ static void ff_layout_write_prepare_v4(struct rpc_task *task, void *data)
 {
        struct nfs_pgio_header *hdr = data;
 
-       if (ff_layout_write_prepare_common(task, hdr))
-               return;
-
        if (ff_layout_setup_sequence(hdr->ds_clp,
                                     &hdr->args.seq_args,
                                     &hdr->res.seq_res,
                                     task))
                return;
 
+       if (ff_layout_write_prepare_common(task, hdr))
+               return;
+
        if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context,
                        hdr->args.lock_context, FMODE_WRITE) == -EIO)
                rpc_exit(task, -EIO); /* lost lock, terminate I/O */
@@ -1139,6 +1480,11 @@ static void ff_layout_write_call_done(struct rpc_task *task, void *data)
 {
        struct nfs_pgio_header *hdr = data;
 
+       nfs4_ff_layout_stat_io_end_write(task,
+                       FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
+                       hdr->args.count, hdr->res.count,
+                       hdr->res.verf->committed);
+
        if (test_bit(NFS_IOHDR_REDO, &hdr->flags) &&
            task->tk_status == 0) {
                nfs4_sequence_done(task, &hdr->res.seq_res);
@@ -1157,8 +1503,17 @@ static void ff_layout_write_count_stats(struct rpc_task *task, void *data)
            &NFS_CLIENT(hdr->inode)->cl_metrics[NFSPROC4_CLNT_WRITE]);
 }
 
+static void ff_layout_commit_prepare_common(struct rpc_task *task,
+               struct nfs_commit_data *cdata)
+{
+       nfs4_ff_layout_stat_io_start_write(cdata->inode,
+                       FF_LAYOUT_COMP(cdata->lseg, cdata->ds_commit_index),
+                       0, task->tk_start);
+}
+
 static void ff_layout_commit_prepare_v3(struct rpc_task *task, void *data)
 {
+       ff_layout_commit_prepare_common(task, data);
        rpc_call_start(task);
 }
 
@@ -1166,10 +1521,30 @@ static void ff_layout_commit_prepare_v4(struct rpc_task *task, void *data)
 {
        struct nfs_commit_data *wdata = data;
 
-       ff_layout_setup_sequence(wdata->ds_clp,
+       if (ff_layout_setup_sequence(wdata->ds_clp,
                                 &wdata->args.seq_args,
                                 &wdata->res.seq_res,
-                                task);
+                                task))
+               return;
+       ff_layout_commit_prepare_common(task, data);
+}
+
+static void ff_layout_commit_done(struct rpc_task *task, void *data)
+{
+       struct nfs_commit_data *cdata = data;
+       struct nfs_page *req;
+       __u64 count = 0;
+
+       if (task->tk_status == 0) {
+               list_for_each_entry(req, &cdata->pages, wb_list)
+                       count += req->wb_bytes;
+       }
+
+       nfs4_ff_layout_stat_io_end_write(task,
+                       FF_LAYOUT_COMP(cdata->lseg, cdata->ds_commit_index),
+                       count, count, NFS_FILE_SYNC);
+
+       pnfs_generic_write_commit_done(task, data);
 }
 
 static void ff_layout_commit_count_stats(struct rpc_task *task, void *data)
@@ -1210,14 +1585,14 @@ static const struct rpc_call_ops ff_layout_write_call_ops_v4 = {
 
 static const struct rpc_call_ops ff_layout_commit_call_ops_v3 = {
        .rpc_call_prepare = ff_layout_commit_prepare_v3,
-       .rpc_call_done = pnfs_generic_write_commit_done,
+       .rpc_call_done = ff_layout_commit_done,
        .rpc_count_stats = ff_layout_commit_count_stats,
        .rpc_release = pnfs_generic_commit_release,
 };
 
 static const struct rpc_call_ops ff_layout_commit_call_ops_v4 = {
        .rpc_call_prepare = ff_layout_commit_prepare_v4,
-       .rpc_call_done = pnfs_generic_write_commit_done,
+       .rpc_call_done = ff_layout_commit_done,
        .rpc_count_stats = ff_layout_commit_count_stats,
        .rpc_release = pnfs_generic_commit_release,
 };
@@ -1261,7 +1636,6 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr)
        fh = nfs4_ff_layout_select_ds_fh(lseg, idx);
        if (fh)
                hdr->args.fh = fh;
-
        /*
         * Note that if we ever decide to split across DSes,
         * then we may need to handle dense-like offsets.
@@ -1390,6 +1764,7 @@ static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how)
        fh = select_ds_fh_from_commit(lseg, data->ds_commit_index);
        if (fh)
                data->args.fh = fh;
+
        return nfs_initiate_commit(ds_clnt, data, ds->ds_clp->rpc_ops,
                                   vers == 3 ? &ff_layout_commit_call_ops_v3 :
                                               &ff_layout_commit_call_ops_v4,
@@ -1484,15 +1859,250 @@ ff_layout_encode_layoutreturn(struct pnfs_layout_hdr *lo,
        start = xdr_reserve_space(xdr, 4);
        BUG_ON(!start);
 
-       if (ff_layout_encode_ioerr(flo, xdr, args))
-               goto out;
-
+       ff_layout_encode_ioerr(flo, xdr, args);
        ff_layout_encode_iostats(flo, xdr, args);
-out:
+
        *start = cpu_to_be32((xdr->p - start - 1) * 4);
        dprintk("%s: Return\n", __func__);
 }
 
+static int
+ff_layout_ntop4(const struct sockaddr *sap, char *buf, const size_t buflen)
+{
+       const struct sockaddr_in *sin = (struct sockaddr_in *)sap;
+
+       return snprintf(buf, buflen, "%pI4", &sin->sin_addr);
+}
+
+static size_t
+ff_layout_ntop6_noscopeid(const struct sockaddr *sap, char *buf,
+                         const int buflen)
+{
+       const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
+       const struct in6_addr *addr = &sin6->sin6_addr;
+
+       /*
+        * RFC 4291, Section 2.2.2
+        *
+        * Shorthanded ANY address
+        */
+       if (ipv6_addr_any(addr))
+               return snprintf(buf, buflen, "::");
+
+       /*
+        * RFC 4291, Section 2.2.2
+        *
+        * Shorthanded loopback address
+        */
+       if (ipv6_addr_loopback(addr))
+               return snprintf(buf, buflen, "::1");
+
+       /*
+        * RFC 4291, Section 2.2.3
+        *
+        * Special presentation address format for mapped v4
+        * addresses.
+        */
+       if (ipv6_addr_v4mapped(addr))
+               return snprintf(buf, buflen, "::ffff:%pI4",
+                                       &addr->s6_addr32[3]);
+
+       /*
+        * RFC 4291, Section 2.2.1
+        */
+       return snprintf(buf, buflen, "%pI6c", addr);
+}
+
+/* Derived from rpc_sockaddr2uaddr */
+static void
+ff_layout_encode_netaddr(struct xdr_stream *xdr, struct nfs4_pnfs_ds_addr *da)
+{
+       struct sockaddr *sap = (struct sockaddr *)&da->da_addr;
+       char portbuf[RPCBIND_MAXUADDRPLEN];
+       char addrbuf[RPCBIND_MAXUADDRLEN];
+       char *netid;
+       unsigned short port;
+       int len, netid_len;
+       __be32 *p;
+
+       switch (sap->sa_family) {
+       case AF_INET:
+               if (ff_layout_ntop4(sap, addrbuf, sizeof(addrbuf)) == 0)
+                       return;
+               port = ntohs(((struct sockaddr_in *)sap)->sin_port);
+               netid = "tcp";
+               netid_len = 3;
+               break;
+       case AF_INET6:
+               if (ff_layout_ntop6_noscopeid(sap, addrbuf, sizeof(addrbuf)) == 0)
+                       return;
+               port = ntohs(((struct sockaddr_in6 *)sap)->sin6_port);
+               netid = "tcp6";
+               netid_len = 4;
+               break;
+       default:
+               /* we only support tcp and tcp6 */
+               WARN_ON_ONCE(1);
+               return;
+       }
+
+       snprintf(portbuf, sizeof(portbuf), ".%u.%u", port >> 8, port & 0xff);
+       len = strlcat(addrbuf, portbuf, sizeof(addrbuf));
+
+       p = xdr_reserve_space(xdr, 4 + netid_len);
+       xdr_encode_opaque(p, netid, netid_len);
+
+       p = xdr_reserve_space(xdr, 4 + len);
+       xdr_encode_opaque(p, addrbuf, len);
+}
+
+static void
+ff_layout_encode_nfstime(struct xdr_stream *xdr,
+                        ktime_t t)
+{
+       struct timespec64 ts;
+       __be32 *p;
+
+       p = xdr_reserve_space(xdr, 12);
+       ts = ktime_to_timespec64(t);
+       p = xdr_encode_hyper(p, ts.tv_sec);
+       *p++ = cpu_to_be32(ts.tv_nsec);
+}
+
+static void
+ff_layout_encode_io_latency(struct xdr_stream *xdr,
+                           struct nfs4_ff_io_stat *stat)
+{
+       __be32 *p;
+
+       p = xdr_reserve_space(xdr, 5 * 8);
+       p = xdr_encode_hyper(p, stat->ops_requested);
+       p = xdr_encode_hyper(p, stat->bytes_requested);
+       p = xdr_encode_hyper(p, stat->ops_completed);
+       p = xdr_encode_hyper(p, stat->bytes_completed);
+       p = xdr_encode_hyper(p, stat->bytes_not_delivered);
+       ff_layout_encode_nfstime(xdr, stat->total_busy_time);
+       ff_layout_encode_nfstime(xdr, stat->aggregate_completion_time);
+}
+
+static void
+ff_layout_encode_layoutstats(struct xdr_stream *xdr,
+                            struct nfs42_layoutstat_args *args,
+                            struct nfs42_layoutstat_devinfo *devinfo)
+{
+       struct nfs4_ff_layout_mirror *mirror = devinfo->layout_private;
+       struct nfs4_pnfs_ds_addr *da;
+       struct nfs4_pnfs_ds *ds = mirror->mirror_ds->ds;
+       struct nfs_fh *fh = &mirror->fh_versions[0];
+       __be32 *p, *start;
+
+       da = list_first_entry(&ds->ds_addrs, struct nfs4_pnfs_ds_addr, da_node);
+       dprintk("%s: DS %s: encoding address %s\n",
+               __func__, ds->ds_remotestr, da->da_remotestr);
+       /* layoutupdate length */
+       start = xdr_reserve_space(xdr, 4);
+       /* netaddr4 */
+       ff_layout_encode_netaddr(xdr, da);
+       /* nfs_fh4 */
+       p = xdr_reserve_space(xdr, 4 + fh->size);
+       xdr_encode_opaque(p, fh->data, fh->size);
+       /* ff_io_latency4 read */
+       spin_lock(&mirror->lock);
+       ff_layout_encode_io_latency(xdr, &mirror->read_stat.io_stat);
+       /* ff_io_latency4 write */
+       ff_layout_encode_io_latency(xdr, &mirror->write_stat.io_stat);
+       spin_unlock(&mirror->lock);
+       /* nfstime4 */
+       ff_layout_encode_nfstime(xdr, ktime_sub(ktime_get(), mirror->start_time));
+       /* bool */
+       p = xdr_reserve_space(xdr, 4);
+       *p = cpu_to_be32(false);
+
+       *start = cpu_to_be32((xdr->p - start - 1) * 4);
+}
+
+static int
+ff_layout_mirror_prepare_stats(struct nfs42_layoutstat_args *args,
+                              struct pnfs_layout_hdr *lo,
+                              int dev_limit)
+{
+       struct nfs4_flexfile_layout *ff_layout = FF_LAYOUT_FROM_HDR(lo);
+       struct nfs4_ff_layout_mirror *mirror;
+       struct nfs4_deviceid_node *dev;
+       struct nfs42_layoutstat_devinfo *devinfo;
+       int i = 0;
+
+       list_for_each_entry(mirror, &ff_layout->mirrors, mirrors) {
+               if (i >= dev_limit)
+                       break;
+               if (!mirror->mirror_ds)
+                       continue;
+               /* mirror refcount put in cleanup_layoutstats */
+               if (!atomic_inc_not_zero(&mirror->ref))
+                       continue;
+               dev = &mirror->mirror_ds->id_node;
+               devinfo = &args->devinfo[i];
+               memcpy(&devinfo->dev_id, &dev->deviceid, NFS4_DEVICEID4_SIZE);
+               devinfo->offset = 0;
+               devinfo->length = NFS4_MAX_UINT64;
+               devinfo->read_count = mirror->read_stat.io_stat.ops_completed;
+               devinfo->read_bytes = mirror->read_stat.io_stat.bytes_completed;
+               devinfo->write_count = mirror->write_stat.io_stat.ops_completed;
+               devinfo->write_bytes = mirror->write_stat.io_stat.bytes_completed;
+               devinfo->layout_type = LAYOUT_FLEX_FILES;
+               devinfo->layoutstats_encode = ff_layout_encode_layoutstats;
+               devinfo->layout_private = mirror;
+
+               i++;
+       }
+       return i;
+}
+
+static int
+ff_layout_prepare_layoutstats(struct nfs42_layoutstat_args *args)
+{
+       struct nfs4_flexfile_layout *ff_layout;
+       struct nfs4_ff_layout_mirror *mirror;
+       int dev_count = 0;
+
+       spin_lock(&args->inode->i_lock);
+       ff_layout = FF_LAYOUT_FROM_HDR(NFS_I(args->inode)->layout);
+       list_for_each_entry(mirror, &ff_layout->mirrors, mirrors) {
+               if (atomic_read(&mirror->ref) != 0)
+                       dev_count++;
+       }
+       spin_unlock(&args->inode->i_lock);
+       /* For now, send at most PNFS_LAYOUTSTATS_MAXDEV statistics */
+       if (dev_count > PNFS_LAYOUTSTATS_MAXDEV) {
+               dprintk("%s: truncating devinfo to limit (%d:%d)\n",
+                       __func__, dev_count, PNFS_LAYOUTSTATS_MAXDEV);
+               dev_count = PNFS_LAYOUTSTATS_MAXDEV;
+       }
+       args->devinfo = kmalloc_array(dev_count, sizeof(*args->devinfo), GFP_NOIO);
+       if (!args->devinfo)
+               return -ENOMEM;
+
+       spin_lock(&args->inode->i_lock);
+       args->num_dev = ff_layout_mirror_prepare_stats(args,
+                       &ff_layout->generic_hdr, dev_count);
+       spin_unlock(&args->inode->i_lock);
+
+       return 0;
+}
+
+static void
+ff_layout_cleanup_layoutstats(struct nfs42_layoutstat_data *data)
+{
+       struct nfs4_ff_layout_mirror *mirror;
+       int i;
+
+       for (i = 0; i < data->args.num_dev; i++) {
+               mirror = data->args.devinfo[i].layout_private;
+               data->args.devinfo[i].layout_private = NULL;
+               ff_layout_put_mirror(mirror);
+       }
+}
+
 static struct pnfs_layoutdriver_type flexfilelayout_type = {
        .id                     = LAYOUT_FLEX_FILES,
        .name                   = "LAYOUT_FLEX_FILES",
@@ -1501,6 +2111,7 @@ static struct pnfs_layoutdriver_type flexfilelayout_type = {
        .free_layout_hdr        = ff_layout_free_layout_hdr,
        .alloc_lseg             = ff_layout_alloc_lseg,
        .free_lseg              = ff_layout_free_lseg,
+       .add_lseg               = ff_layout_add_lseg,
        .pg_read_ops            = &ff_layout_pg_read_ops,
        .pg_write_ops           = &ff_layout_pg_write_ops,
        .get_ds_info            = ff_layout_get_ds_info,
@@ -1515,6 +2126,8 @@ static struct pnfs_layoutdriver_type flexfilelayout_type = {
        .alloc_deviceid_node    = ff_layout_alloc_deviceid_node,
        .encode_layoutreturn    = ff_layout_encode_layoutreturn,
        .sync                   = pnfs_nfs_generic_sync,
+       .prepare_layoutstats    = ff_layout_prepare_layoutstats,
+       .cleanup_layoutstats    = ff_layout_cleanup_layoutstats,
 };
 
 static int __init nfs4flexfilelayout_init(void)