These changes are the raw update of the kvmfornfv.git kernel sources to linux-4.4.6-rt14.
diff --git a/kernel/fs/jbd2/transaction.c b/kernel/fs/jbd2/transaction.c
index ff2f2e6..ca181e8 100644
@@ -204,6 +204,20 @@ static int add_transaction_credits(journal_t *journal, int blocks,
                 * attach this handle to a new transaction.
                 */
                atomic_sub(total, &t->t_outstanding_credits);
+
+               /*
+                * Is the number of reserved credits in the current transaction too
+                * big to fit this handle? Wait until reserved credits are freed.
+                */
+               if (atomic_read(&journal->j_reserved_credits) + total >
+                   journal->j_max_transaction_buffers) {
+                       read_unlock(&journal->j_state_lock);
+                       wait_event(journal->j_wait_reserved,
+                                  atomic_read(&journal->j_reserved_credits) + total <=
+                                  journal->j_max_transaction_buffers);
+                       return 1;
+               }
+
                wait_transaction_locked(journal);
                return 1;
        }
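
The new check closes a window where a handle could otherwise start while
earlier reserved handles still hold enough credits to overflow the
transaction. Note that after waking from j_wait_reserved the function
returns 1 without retaking j_state_lock, so the caller must revalidate the
running transaction from scratch. A minimal sketch of that retry contract,
assuming the usual caller shape in start_this_handle() (not the literal
kernel code):

	read_lock(&journal->j_state_lock);
	/* ... find or install the running transaction ... */
	if (add_transaction_credits(journal, blocks, rsv_blocks))
		goto repeat;	/* j_state_lock was dropped; start over */
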
@@ -262,38 +276,36 @@ static int start_this_handle(journal_t *journal, handle_t *handle,
        int             rsv_blocks = 0;
        unsigned long ts = jiffies;
 
+       if (handle->h_rsv_handle)
+               rsv_blocks = handle->h_rsv_handle->h_buffer_credits;
+
        /*
-        * 1/2 of transaction can be reserved so we can practically handle
-        * only 1/2 of maximum transaction size per operation
+        * Limit the number of reserved credits to 1/2 of maximum transaction
+        * size and limit the number of total credits to not exceed maximum
+        * transaction size per operation.
         */
-       if (WARN_ON(blocks > journal->j_max_transaction_buffers / 2)) {
-               printk(KERN_ERR "JBD2: %s wants too many credits (%d > %d)\n",
-                      current->comm, blocks,
-                      journal->j_max_transaction_buffers / 2);
+       if ((rsv_blocks > journal->j_max_transaction_buffers / 2) ||
+           (rsv_blocks + blocks > journal->j_max_transaction_buffers)) {
+               printk(KERN_ERR "JBD2: %s wants too many credits "
+                      "credits:%d rsv_credits:%d max:%d\n",
+                      current->comm, blocks, rsv_blocks,
+                      journal->j_max_transaction_buffers);
+               WARN_ON(1);
                return -ENOSPC;
        }
 
-       if (handle->h_rsv_handle)
-               rsv_blocks = handle->h_rsv_handle->h_buffer_credits;
-
 alloc_transaction:
        if (!journal->j_running_transaction) {
+               /*
+                * If __GFP_FS is not present, then we may be being called from
+                * inside the fs writeback layer, so we MUST NOT fail.
+                */
+               if ((gfp_mask & __GFP_FS) == 0)
+                       gfp_mask |= __GFP_NOFAIL;
                new_transaction = kmem_cache_zalloc(transaction_cache,
                                                    gfp_mask);
-               if (!new_transaction) {
-                       /*
-                        * If __GFP_FS is not present, then we may be
-                        * being called from inside the fs writeback
-                        * layer, so we MUST NOT fail.  Since
-                        * __GFP_NOFAIL is going away, we will arrange
-                        * to retry the allocation ourselves.
-                        */
-                       if ((gfp_mask & __GFP_FS) == 0) {
-                               congestion_wait(BLK_RW_ASYNC, HZ/50);
-                               goto alloc_transaction;
-                       }
+               if (!new_transaction)
                        return -ENOMEM;
-               }
        }
 
        jbd_debug(3, "New handle %p going live.\n", handle);
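
Three things change in start_this_handle(): rsv_blocks is read before the
limit check so reserved handles are accounted for; the old blanket "no more
than half a transaction" limit is split so that only the reservation is
capped at j_max_transaction_buffers / 2 while blocks + rsv_blocks merely has
to fit in a whole transaction (with max = 8192, a plain handle may now ask
for up to 8192 credits instead of 4096, while rsv_blocks = 5000 is rejected
outright); and the open-coded congestion_wait() retry loop for allocations
from writeback context is replaced by passing __GFP_NOFAIL, relying again
on the allocator's support for must-not-fail allocations.
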
@@ -761,6 +773,30 @@ static void warn_dirty_buffer(struct buffer_head *bh)
               bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr);
 }
 
+/* Call t_frozen trigger and copy buffer data into jh->b_frozen_data. */
+static void jbd2_freeze_jh_data(struct journal_head *jh)
+{
+       struct page *page;
+       int offset;
+       char *source;
+       struct buffer_head *bh = jh2bh(jh);
+
+       J_EXPECT_JH(jh, buffer_uptodate(bh), "Possible IO failure.\n");
+       page = bh->b_page;
+       offset = offset_in_page(bh->b_data);
+       source = kmap_atomic(page);
+       /* Fire data frozen trigger just before we copy the data */
+       jbd2_buffer_frozen_trigger(jh, source + offset, jh->b_triggers);
+       memcpy(jh->b_frozen_data, source + offset, bh->b_size);
+       kunmap_atomic(source);
+
+       /*
+        * Now that the frozen data is saved off, we need to store any matching
+        * triggers.
+        */
+       jh->b_frozen_triggers = jh->b_triggers;
+}
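
This helper hoists the copy-out that previously sat at the bottom of
do_get_write_access() behind the need_copy flag. kmap_atomic() is needed
because the buffer's page may live in highmem, and the t_frozen trigger
fires on the live data immediately before the copy so that the frozen
image matches what the trigger produced.
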
+
 /*
  * If the buffer is already part of the current transaction, then there
  * is nothing we need to do.  If it is already part of a prior
@@ -780,7 +816,6 @@ do_get_write_access(handle_t *handle, struct journal_head *jh,
        journal_t *journal;
        int error;
        char *frozen_buffer = NULL;
-       int need_copy = 0;
        unsigned long start_lock, time_lock;
 
        if (is_handle_aborted(handle))
@@ -866,6 +901,26 @@ repeat:
         */
        jh->b_modified = 0;
 
+       /*
+        * If the buffer is not journaled right now, we need to make sure it
+        * doesn't get written to disk before the caller actually commits the
+        * new data
+        */
+       if (!jh->b_transaction) {
+               JBUFFER_TRACE(jh, "no transaction");
+               J_ASSERT_JH(jh, !jh->b_next_transaction);
+               JBUFFER_TRACE(jh, "file as BJ_Reserved");
+               /*
+                * Make sure all stores to jh (b_modified, b_frozen_data) are
+                * visible before attaching it to the running transaction.
+                * Paired with barrier in jbd2_write_access_granted()
+                */
+               smp_wmb();
+               spin_lock(&journal->j_list_lock);
+               __jbd2_journal_file_buffer(jh, transaction, BJ_Reserved);
+               spin_unlock(&journal->j_list_lock);
+               goto done;
+       }
        /*
         * If there is already a copy-out version of this buffer, then we don't
         * need to make another one
@@ -873,113 +928,70 @@ repeat:
        if (jh->b_frozen_data) {
                JBUFFER_TRACE(jh, "has frozen data");
                J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
-               jh->b_next_transaction = transaction;
-               goto done;
+               goto attach_next;
        }
 
-       /* Is there data here we need to preserve? */
+       JBUFFER_TRACE(jh, "owned by older transaction");
+       J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
+       J_ASSERT_JH(jh, jh->b_transaction == journal->j_committing_transaction);
 
-       if (jh->b_transaction && jh->b_transaction != transaction) {
-               JBUFFER_TRACE(jh, "owned by older transaction");
-               J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
-               J_ASSERT_JH(jh, jh->b_transaction ==
-                                       journal->j_committing_transaction);
+       /*
+        * There is one case we have to be very careful about.  If the
+        * committing transaction is currently writing this buffer out to disk
+        * and has NOT made a copy-out, then we cannot modify the buffer
+        * contents at all right now.  The essence of copy-out is that it is
+        * the extra copy, not the primary copy, which gets journaled.  If the
+        * primary copy is already going to disk then we cannot do copy-out
+        * here.
+        */
+       if (buffer_shadow(bh)) {
+               JBUFFER_TRACE(jh, "on shadow: sleep");
+               jbd_unlock_bh_state(bh);
+               wait_on_bit_io(&bh->b_state, BH_Shadow, TASK_UNINTERRUPTIBLE);
+               goto repeat;
+       }
 
-               /* There is one case we have to be very careful about.
-                * If the committing transaction is currently writing
-                * this buffer out to disk and has NOT made a copy-out,
-                * then we cannot modify the buffer contents at all
-                * right now.  The essence of copy-out is that it is the
-                * extra copy, not the primary copy, which gets
-                * journaled.  If the primary copy is already going to
-                * disk then we cannot do copy-out here. */
-
-               if (buffer_shadow(bh)) {
-                       JBUFFER_TRACE(jh, "on shadow: sleep");
+       /*
+        * Only do the copy if the currently-owning transaction still needs it.
+        * If buffer isn't on BJ_Metadata list, the committing transaction is
+        * past that stage (here we use the fact that BH_Shadow is set under
+        * bh_state lock together with refiling to BJ_Shadow list and at this
+        * point we know the buffer doesn't have BH_Shadow set).
+        *
+        * Subtle point, though: if this is a get_undo_access, then we will be
+        * relying on the frozen_data to contain the new value of the
+        * committed_data record after the transaction, so we HAVE to force the
+        * frozen_data copy in that case.
+        */
+       if (jh->b_jlist == BJ_Metadata || force_copy) {
+               JBUFFER_TRACE(jh, "generate frozen data");
+               if (!frozen_buffer) {
+                       JBUFFER_TRACE(jh, "allocate memory for buffer");
                        jbd_unlock_bh_state(bh);
-                       wait_on_bit_io(&bh->b_state, BH_Shadow,
-                                      TASK_UNINTERRUPTIBLE);
-                       goto repeat;
-               }
-
-               /*
-                * Only do the copy if the currently-owning transaction still
-                * needs it. If buffer isn't on BJ_Metadata list, the
-                * committing transaction is past that stage (here we use the
-                * fact that BH_Shadow is set under bh_state lock together with
-                * refiling to BJ_Shadow list and at this point we know the
-                * buffer doesn't have BH_Shadow set).
-                *
-                * Subtle point, though: if this is a get_undo_access,
-                * then we will be relying on the frozen_data to contain
-                * the new value of the committed_data record after the
-                * transaction, so we HAVE to force the frozen_data copy
-                * in that case.
-                */
-               if (jh->b_jlist == BJ_Metadata || force_copy) {
-                       JBUFFER_TRACE(jh, "generate frozen data");
+                       frozen_buffer = jbd2_alloc(jh2bh(jh)->b_size, GFP_NOFS);
                        if (!frozen_buffer) {
-                               JBUFFER_TRACE(jh, "allocate memory for buffer");
-                               jbd_unlock_bh_state(bh);
-                               frozen_buffer =
-                                       jbd2_alloc(jh2bh(jh)->b_size,
-                                                        GFP_NOFS);
-                               if (!frozen_buffer) {
-                                       printk(KERN_ERR
-                                              "%s: OOM for frozen_buffer\n",
-                                              __func__);
-                                       JBUFFER_TRACE(jh, "oom!");
-                                       error = -ENOMEM;
-                                       jbd_lock_bh_state(bh);
-                                       goto done;
-                               }
-                               goto repeat;
+                               printk(KERN_ERR "%s: OOM for frozen_buffer\n",
+                                      __func__);
+                               JBUFFER_TRACE(jh, "oom!");
+                               error = -ENOMEM;
+                               goto out;
                        }
-                       jh->b_frozen_data = frozen_buffer;
-                       frozen_buffer = NULL;
-                       need_copy = 1;
+                       goto repeat;
                }
-               jh->b_next_transaction = transaction;
+               jh->b_frozen_data = frozen_buffer;
+               frozen_buffer = NULL;
+               jbd2_freeze_jh_data(jh);
        }
-
-
+attach_next:
        /*
-        * Finally, if the buffer is not journaled right now, we need to make
-        * sure it doesn't get written to disk before the caller actually
-        * commits the new data
+        * Make sure all stores to jh (b_modified, b_frozen_data) are visible
+        * before attaching it to the running transaction. Paired with barrier
+        * in jbd2_write_access_granted()
         */
-       if (!jh->b_transaction) {
-               JBUFFER_TRACE(jh, "no transaction");
-               J_ASSERT_JH(jh, !jh->b_next_transaction);
-               JBUFFER_TRACE(jh, "file as BJ_Reserved");
-               spin_lock(&journal->j_list_lock);
-               __jbd2_journal_file_buffer(jh, transaction, BJ_Reserved);
-               spin_unlock(&journal->j_list_lock);
-       }
+       smp_wmb();
+       jh->b_next_transaction = transaction;
 
 done:
-       if (need_copy) {
-               struct page *page;
-               int offset;
-               char *source;
-
-               J_EXPECT_JH(jh, buffer_uptodate(jh2bh(jh)),
-                           "Possible IO failure.\n");
-               page = jh2bh(jh)->b_page;
-               offset = offset_in_page(jh2bh(jh)->b_data);
-               source = kmap_atomic(page);
-               /* Fire data frozen trigger just before we copy the data */
-               jbd2_buffer_frozen_trigger(jh, source + offset,
-                                          jh->b_triggers);
-               memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size);
-               kunmap_atomic(source);
-
-               /*
-                * Now that the frozen data is saved off, we need to store
-                * any matching triggers.
-                */
-               jh->b_frozen_triggers = jh->b_triggers;
-       }
        jbd_unlock_bh_state(bh);
 
        /*
@@ -996,6 +1008,59 @@ out:
        return error;
 }
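
After the restructuring, do_get_write_access() reads as a straight path: a
buffer on no transaction at all is filed under BJ_Reserved and finishes at
done:, a buffer that already has frozen data jumps to attach_next:, and
otherwise the buffer must belong to the committing transaction, in which
case the copy-out now happens immediately via jbd2_freeze_jh_data() while
the bh_state lock is held rather than being deferred through the removed
need_copy flag. The smp_wmb() calls publish b_modified and b_frozen_data
before the attachment store makes the buffer visible to the new lockless
fast path below.
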
 
+/* Fast check whether buffer is already attached to the required transaction */
+static bool jbd2_write_access_granted(handle_t *handle, struct buffer_head *bh,
+                                                       bool undo)
+{
+       struct journal_head *jh;
+       bool ret = false;
+
+       /* Dirty buffers require special handling... */
+       if (buffer_dirty(bh))
+               return false;
+
+       /*
+        * RCU protects us from dereferencing freed pages. So the checks we do
+        * are guaranteed not to oops. However the jh slab object can get freed
+        * & reallocated while we work with it. So we have to be careful. When
+        * we see jh attached to the running transaction, we know it must stay
+        * so until the transaction is committed. Thus jh won't be freed and
+        * will be attached to the same bh while we run.  However it can
+        * happen jh gets freed, reallocated, and attached to the transaction
+        * just after we get pointer to it from bh. So we have to be careful
+        * and recheck jh still belongs to our bh before we return success.
+        */
+       rcu_read_lock();
+       if (!buffer_jbd(bh))
+               goto out;
+       /* This should be bh2jh() but that doesn't work with inline functions */
+       jh = READ_ONCE(bh->b_private);
+       if (!jh)
+               goto out;
+       /* For undo access buffer must have data copied */
+       if (undo && !jh->b_committed_data)
+               goto out;
+       if (jh->b_transaction != handle->h_transaction &&
+           jh->b_next_transaction != handle->h_transaction)
+               goto out;
+       /*
+        * There are two reasons for the barrier here:
+        * 1) Make sure to fetch b_bh after we did previous checks so that we
+        * detect when jh went through free, realloc, attach to transaction
+        * while we were checking. Paired with implicit barrier in that path.
+        * 2) So that access to bh done after jbd2_write_access_granted()
+        * doesn't get reordered and see inconsistent state of concurrent
+        * do_get_write_access().
+        */
+       smp_mb();
+       if (unlikely(jh->b_bh != bh))
+               goto out;
+       ret = true;
+out:
+       rcu_read_unlock();
+       return ret;
+}
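
The ordering contract between the two sides can be distilled as follows (a
sketch of the barrier pairing, not the literal kernel code):

	/* writer: do_get_write_access() */
	jh->b_modified = 0;
	jh->b_frozen_data = frozen_buffer;
	smp_wmb();				/* publish jh state ... */
	jh->b_next_transaction = transaction;	/* ... before attaching */

	/* reader: jbd2_write_access_granted() */
	jh = READ_ONCE(bh->b_private);
	if (jh->b_next_transaction == handle->h_transaction) {
		smp_mb();			/* pairs with the smp_wmb() */
		if (jh->b_bh == bh)		/* not freed and reused */
			return true;		/* jh state is visible */
	}

If the reader observes the attachment, the earlier stores are guaranteed to
be visible; re-checking jh->b_bh after the full barrier catches a
journal_head that was freed and reallocated for a different buffer in the
meantime.
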
+
 /**
  * int jbd2_journal_get_write_access() - notify intent to modify a buffer for metadata (not data) update.
  * @handle: transaction to add buffer modifications to
@@ -1009,9 +1074,13 @@ out:
 
 int jbd2_journal_get_write_access(handle_t *handle, struct buffer_head *bh)
 {
-       struct journal_head *jh = jbd2_journal_add_journal_head(bh);
+       struct journal_head *jh;
        int rc;
 
+       if (jbd2_write_access_granted(handle, bh, false))
+               return 0;
+
+       jh = jbd2_journal_add_journal_head(bh);
        /* We do not want to get caught playing with fields which the
         * log thread also manipulates.  Make sure that the buffer
         * completes any outstanding IO before proceeding. */
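
On a fast-path hit, jbd2_journal_get_write_access() now skips the
jbd2_journal_add_journal_head() / put pair entirely, avoiding the
BH_JournalHead bit lock on every repeated write access to a buffer that is
already attached to the running transaction.
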
@@ -1141,11 +1210,14 @@ out:
 int jbd2_journal_get_undo_access(handle_t *handle, struct buffer_head *bh)
 {
        int err;
-       struct journal_head *jh = jbd2_journal_add_journal_head(bh);
+       struct journal_head *jh;
        char *committed_data = NULL;
 
        JBUFFER_TRACE(jh, "entry");
+       if (jbd2_write_access_granted(handle, bh, true))
+               return 0;
 
+       jh = jbd2_journal_add_journal_head(bh);
        /*
         * Do this first --- it can drop the journal lock, so we want to
         * make sure that obtaining the committed_data is done
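
One wrinkle in the reordered prologue: JBUFFER_TRACE(jh, "entry") now runs
before jh is assigned. With JBUFFER_TRACE defined as a no-op in jbd2.h this
compiles away to nothing, but it would touch an uninitialized pointer if
the tracing hook were ever turned back on.
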
@@ -1230,8 +1302,6 @@ void jbd2_buffer_abort_trigger(struct journal_head *jh,
        triggers->t_abort(triggers, jh2bh(jh));
 }
 
-
-
 /**
  * int jbd2_journal_dirty_metadata() -  mark a buffer as containing dirty metadata
  * @handle: transaction to add buffer to.
@@ -1264,12 +1334,41 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
 
        if (is_handle_aborted(handle))
                return -EROFS;
-       journal = transaction->t_journal;
-       jh = jbd2_journal_grab_journal_head(bh);
-       if (!jh) {
+       if (!buffer_jbd(bh)) {
                ret = -EUCLEAN;
                goto out;
        }
+       /*
+        * We don't grab jh reference here since the buffer must be part
+        * of the running transaction.
+        */
+       jh = bh2jh(bh);
+       /*
+        * This and the following assertions are unreliable since we may see jh
+        * in inconsistent state unless we grab bh_state lock. But this is
+        * crucial to catch bugs so let's do a reliable check until the
+        * lockless handling is fully proven.
+        */
+       if (jh->b_transaction != transaction &&
+           jh->b_next_transaction != transaction) {
+               jbd_lock_bh_state(bh);
+               J_ASSERT_JH(jh, jh->b_transaction == transaction ||
+                               jh->b_next_transaction == transaction);
+               jbd_unlock_bh_state(bh);
+       }
+       if (jh->b_modified == 1) {
+               /* If it's in our transaction it must be in BJ_Metadata list. */
+               if (jh->b_transaction == transaction &&
+                   jh->b_jlist != BJ_Metadata) {
+                       jbd_lock_bh_state(bh);
+                       J_ASSERT_JH(jh, jh->b_transaction != transaction ||
+                                       jh->b_jlist == BJ_Metadata);
+                       jbd_unlock_bh_state(bh);
+               }
+               goto out;
+       }
+
+       journal = transaction->t_journal;
        jbd_debug(5, "journal_head %p\n", jh);
        JBUFFER_TRACE(jh, "entry");
 
@@ -1360,7 +1459,6 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
        spin_unlock(&journal->j_list_lock);
 out_unlock_bh:
        jbd_unlock_bh_state(bh);
-       jbd2_journal_put_journal_head(jh);
 out:
        JBUFFER_TRACE(jh, "exit");
        return ret;
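
jbd2_journal_dirty_metadata() no longer grabs and drops a journal_head
reference on every call: as the new comment notes, a buffer being dirtied
must already be attached to the running transaction via a prior write
access, so bh2jh() is stable for the duration. The unlocked assertion
checks are deliberately repeated under the bh_state lock before they can
trip, so a transient state seen without the lock cannot raise a false
positive, and the common b_modified == 1 case (credits already accounted by
an earlier call in this transaction) now returns without taking any lock at
all. The jbd2_journal_put_journal_head() removal on the exit path is the
counterpart of the dropped grab.
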
@@ -1843,8 +1941,8 @@ out:
  * @journal: journal for operation
  * @page: to try and free
  * @gfp_mask: we use the mask to detect how hard should we try to release
- * buffers. If __GFP_WAIT and __GFP_FS is set, we wait for commit code to
- * release the buffers.
+ * buffers. If __GFP_DIRECT_RECLAIM and __GFP_FS is set, we wait for commit
+ * code to release the buffers.
  *
  *
  * For all the buffers on this page,
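
Documentation-only fixup: the 4.4 gfp-flag rework replaced __GFP_WAIT with
__GFP_DIRECT_RECLAIM, so the comment is updated to name the flag the code
actually tests.
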
@@ -2058,6 +2156,7 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh,
 
                if (!buffer_dirty(bh)) {
                        /* bdflush has written it.  We can drop it now */
+                       __jbd2_journal_remove_checkpoint(jh);
                        goto zap_buffer;
                }
 
@@ -2087,6 +2186,7 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh,
                                /* The orphan record's transaction has
                                 * committed.  We can cleanse this buffer */
                                clear_buffer_jbddirty(bh);
+                               __jbd2_journal_remove_checkpoint(jh);
                                goto zap_buffer;
                        }
                }
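
Both additions call __jbd2_journal_remove_checkpoint() before zapping a
buffer the journal no longer needs, so the buffer is dropped from its
checkpoint list as well. Without this, a clean buffer could keep its old
transaction on the checkpoint list and, in data=journal setups, keep
truncated pages unreclaimable until the next checkpoint ran.
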