]> git.openfabrics.org - ~shefty/rdma-dev.git/blobdiff - fs/ext3/inode.c
Merge branch 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/jack/linux...
[~shefty/rdma-dev.git] / fs / ext3 / inode.c
index 2978a2a17a59b8c0e34a381f3bba4ab1b059d396..04da6acde85dfbcfb50d89302c3f975baab3c76d 100644 (file)
 #include <linux/bio.h>
 #include <linux/fiemap.h>
 #include <linux/namei.h>
+#include <trace/events/ext3.h>
 #include "xattr.h"
 #include "acl.h"
 
 static int ext3_writepage_trans_blocks(struct inode *inode);
+static int ext3_block_truncate_page(struct inode *inode, loff_t from);
 
 /*
  * Test whether an inode is a fast symlink.
@@ -70,6 +72,7 @@ int ext3_forget(handle_t *handle, int is_metadata, struct inode *inode,
 
        might_sleep();
 
+       trace_ext3_forget(inode, is_metadata, blocknr);
        BUFFER_TRACE(bh, "enter");
 
        jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, "
@@ -194,20 +197,47 @@ static int truncate_restart_transaction(handle_t *handle, struct inode *inode)
  */
 void ext3_evict_inode (struct inode *inode)
 {
+       struct ext3_inode_info *ei = EXT3_I(inode);
        struct ext3_block_alloc_info *rsv;
        handle_t *handle;
        int want_delete = 0;
 
+       trace_ext3_evict_inode(inode);
        if (!inode->i_nlink && !is_bad_inode(inode)) {
                dquot_initialize(inode);
                want_delete = 1;
        }
 
+       /*
+        * When journalling data dirty buffers are tracked only in the journal.
+        * So although mm thinks everything is clean and ready for reaping the
+        * inode might still have some pages to write in the running
+        * transaction or waiting to be checkpointed. Thus calling
+        * journal_invalidatepage() (via truncate_inode_pages()) to discard
+        * these buffers can cause data loss. Also even if we did not discard
+        * these buffers, we would have no way to find them after the inode
+        * is reaped and thus user could see stale data if he tries to read
+        * them before the transaction is checkpointed. So be careful and
+        * force everything to disk here... We use ei->i_datasync_tid to
+        * store the newest transaction containing inode's data.
+        *
+        * Note that directories do not have this problem because they don't
+        * use page cache.
+        */
+       if (inode->i_nlink && ext3_should_journal_data(inode) &&
+           (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode))) {
+               tid_t commit_tid = atomic_read(&ei->i_datasync_tid);
+               journal_t *journal = EXT3_SB(inode->i_sb)->s_journal;
+
+               log_start_commit(journal, commit_tid);
+               log_wait_commit(journal, commit_tid);
+               filemap_write_and_wait(&inode->i_data);
+       }
        truncate_inode_pages(&inode->i_data, 0);
 
        ext3_discard_reservation(inode);
-       rsv = EXT3_I(inode)->i_block_alloc_info;
-       EXT3_I(inode)->i_block_alloc_info = NULL;
+       rsv = ei->i_block_alloc_info;
+       ei->i_block_alloc_info = NULL;
        if (unlikely(rsv))
                kfree(rsv);
 
@@ -231,15 +261,13 @@ void ext3_evict_inode (struct inode *inode)
        if (inode->i_blocks)
                ext3_truncate(inode);
        /*
-        * Kill off the orphan record which ext3_truncate created.
-        * AKPM: I think this can be inside the above `if'.
-        * Note that ext3_orphan_del() has to be able to cope with the
-        * deletion of a non-existent orphan - this is because we don't
-        * know if ext3_truncate() actually created an orphan record.
-        * (Well, we could do this if we need to, but heck - it works)
+        * Kill off the orphan record created when the inode lost the last
+        * link.  Note that ext3_orphan_del() has to be able to cope with the
+        * deletion of a non-existent orphan - ext3_truncate() could
+        * have removed the record.
         */
        ext3_orphan_del(handle, inode);
-       EXT3_I(inode)->i_dtime  = get_seconds();
+       ei->i_dtime = get_seconds();
 
        /*
         * One subtle ordering requirement: if anything has gone wrong
@@ -842,6 +870,7 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode,
        ext3_fsblk_t first_block = 0;
 
 
+       trace_ext3_get_blocks_enter(inode, iblock, maxblocks, create);
        J_ASSERT(handle != NULL || create == 0);
        depth = ext3_block_to_path(inode,iblock,offsets,&blocks_to_boundary);
 
@@ -886,6 +915,9 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode,
        if (!create || err == -EIO)
                goto cleanup;
 
+       /*
+        * Block out ext3_truncate while we alter the tree
+        */
        mutex_lock(&ei->truncate_mutex);
 
        /*
@@ -934,9 +966,6 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode,
         */
        count = ext3_blks_to_allocate(partial, indirect_blks,
                                        maxblocks, blocks_to_boundary);
-       /*
-        * Block out ext3_truncate while we alter the tree
-        */
        err = ext3_alloc_branch(handle, inode, indirect_blks, &count, goal,
                                offsets + (partial - chain), partial);
 
@@ -970,6 +999,9 @@ cleanup:
        }
        BUFFER_TRACE(bh_result, "returned");
 out:
+       trace_ext3_get_blocks_exit(inode, iblock,
+                                  depth ? le32_to_cpu(chain[depth-1].key) : 0,
+                                  count, err);
        return err;
 }
 
@@ -1202,6 +1234,16 @@ static void ext3_truncate_failed_write(struct inode *inode)
        ext3_truncate(inode);
 }
 
+/*
+ * Truncate blocks that were not used by direct IO write. We have to zero out
+ * the last file block as well because direct IO might have written to it.
+ */
+static void ext3_truncate_failed_direct_write(struct inode *inode)
+{
+       ext3_block_truncate_page(inode, inode->i_size);
+       ext3_truncate(inode);
+}
+
 static int ext3_write_begin(struct file *file, struct address_space *mapping,
                                loff_t pos, unsigned len, unsigned flags,
                                struct page **pagep, void **fsdata)
@@ -1217,6 +1259,8 @@ static int ext3_write_begin(struct file *file, struct address_space *mapping,
         * we allocate blocks but write fails for some reason */
        int needed_blocks = ext3_writepage_trans_blocks(inode) + 1;
 
+       trace_ext3_write_begin(inode, pos, len, flags);
+
        index = pos >> PAGE_CACHE_SHIFT;
        from = pos & (PAGE_CACHE_SIZE - 1);
        to = from + len;
@@ -1332,6 +1376,7 @@ static int ext3_ordered_write_end(struct file *file,
        unsigned from, to;
        int ret = 0, ret2;
 
+       trace_ext3_ordered_write_end(inode, pos, len, copied);
        copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
 
        from = pos & (PAGE_CACHE_SIZE - 1);
@@ -1367,6 +1412,7 @@ static int ext3_writeback_write_end(struct file *file,
        struct inode *inode = file->f_mapping->host;
        int ret;
 
+       trace_ext3_writeback_write_end(inode, pos, len, copied);
        copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
        update_file_sizes(inode, pos, copied);
        /*
@@ -1391,10 +1437,12 @@ static int ext3_journalled_write_end(struct file *file,
 {
        handle_t *handle = ext3_journal_current_handle();
        struct inode *inode = mapping->host;
+       struct ext3_inode_info *ei = EXT3_I(inode);
        int ret = 0, ret2;
        int partial = 0;
        unsigned from, to;
 
+       trace_ext3_journalled_write_end(inode, pos, len, copied);
        from = pos & (PAGE_CACHE_SIZE - 1);
        to = from + len;
 
@@ -1419,8 +1467,9 @@ static int ext3_journalled_write_end(struct file *file,
        if (pos + len > inode->i_size && ext3_can_truncate(inode))
                ext3_orphan_add(handle, inode);
        ext3_set_inode_state(inode, EXT3_STATE_JDATA);
-       if (inode->i_size > EXT3_I(inode)->i_disksize) {
-               EXT3_I(inode)->i_disksize = inode->i_size;
+       atomic_set(&ei->i_datasync_tid, handle->h_transaction->t_tid);
+       if (inode->i_size > ei->i_disksize) {
+               ei->i_disksize = inode->i_size;
                ret2 = ext3_mark_inode_dirty(handle, inode);
                if (!ret)
                        ret = ret2;
@@ -1577,6 +1626,7 @@ static int ext3_ordered_writepage(struct page *page,
        if (ext3_journal_current_handle())
                goto out_fail;
 
+       trace_ext3_ordered_writepage(page);
        if (!page_has_buffers(page)) {
                create_empty_buffers(page, inode->i_sb->s_blocksize,
                                (1 << BH_Dirty)|(1 << BH_Uptodate));
@@ -1647,6 +1697,7 @@ static int ext3_writeback_writepage(struct page *page,
        if (ext3_journal_current_handle())
                goto out_fail;
 
+       trace_ext3_writeback_writepage(page);
        if (page_has_buffers(page)) {
                if (!walk_page_buffers(NULL, page_buffers(page), 0,
                                      PAGE_CACHE_SIZE, NULL, buffer_unmapped)) {
@@ -1689,6 +1740,7 @@ static int ext3_journalled_writepage(struct page *page,
        if (ext3_journal_current_handle())
                goto no_write;
 
+       trace_ext3_journalled_writepage(page);
        handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
        if (IS_ERR(handle)) {
                ret = PTR_ERR(handle);
@@ -1715,6 +1767,8 @@ static int ext3_journalled_writepage(struct page *page,
                if (ret == 0)
                        ret = err;
                ext3_set_inode_state(inode, EXT3_STATE_JDATA);
+               atomic_set(&EXT3_I(inode)->i_datasync_tid,
+                          handle->h_transaction->t_tid);
                unlock_page(page);
        } else {
                /*
@@ -1739,6 +1793,7 @@ out_unlock:
 
 static int ext3_readpage(struct file *file, struct page *page)
 {
+       trace_ext3_readpage(page);
        return mpage_readpage(page, ext3_get_block);
 }
 
@@ -1753,6 +1808,8 @@ static void ext3_invalidatepage(struct page *page, unsigned long offset)
 {
        journal_t *journal = EXT3_JOURNAL(page->mapping->host);
 
+       trace_ext3_invalidatepage(page, offset);
+
        /*
         * If it's a full truncate we just forget about the pending dirtying
         */
@@ -1766,6 +1823,7 @@ static int ext3_releasepage(struct page *page, gfp_t wait)
 {
        journal_t *journal = EXT3_JOURNAL(page->mapping->host);
 
+       trace_ext3_releasepage(page);
        WARN_ON(PageChecked(page));
        if (!page_has_buffers(page))
                return 0;
@@ -1794,6 +1852,8 @@ static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb,
        size_t count = iov_length(iov, nr_segs);
        int retries = 0;
 
+       trace_ext3_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw);
+
        if (rw == WRITE) {
                loff_t final_size = offset + count;
 
@@ -1827,7 +1887,7 @@ retry:
                loff_t end = offset + iov_length(iov, nr_segs);
 
                if (end > isize)
-                       vmtruncate(inode, isize);
+                       ext3_truncate_failed_direct_write(inode);
        }
        if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
                goto retry;
@@ -1841,7 +1901,7 @@ retry:
                        /* This is really bad luck. We've written the data
                         * but cannot extend i_size. Truncate allocated blocks
                         * and pretend the write failed... */
-                       ext3_truncate(inode);
+                       ext3_truncate_failed_direct_write(inode);
                        ret = PTR_ERR(handle);
                        goto out;
                }
@@ -1867,6 +1927,8 @@ retry:
                        ret = err;
        }
 out:
+       trace_ext3_direct_IO_exit(inode, offset,
+                               iov_length(iov, nr_segs), rw, ret);
        return ret;
 }
 
@@ -1949,17 +2011,24 @@ void ext3_set_aops(struct inode *inode)
  * This required during truncate. We need to physically zero the tail end
  * of that block so it doesn't yield old data if the file is later grown.
  */
-static int ext3_block_truncate_page(handle_t *handle, struct page *page,
-               struct address_space *mapping, loff_t from)
+static int ext3_block_truncate_page(struct inode *inode, loff_t from)
 {
        ext3_fsblk_t index = from >> PAGE_CACHE_SHIFT;
-       unsigned offset = from & (PAGE_CACHE_SIZE-1);
+       unsigned offset = from & (PAGE_CACHE_SIZE - 1);
        unsigned blocksize, iblock, length, pos;
-       struct inode *inode = mapping->host;
+       struct page *page;
+       handle_t *handle = NULL;
        struct buffer_head *bh;
        int err = 0;
 
+       /* Truncated on block boundary - nothing to do */
        blocksize = inode->i_sb->s_blocksize;
+       if ((from & (blocksize - 1)) == 0)
+               return 0;
+
+       page = grab_cache_page(inode->i_mapping, index);
+       if (!page)
+               return -ENOMEM;
        length = blocksize - (offset & (blocksize - 1));
        iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
 
@@ -2004,11 +2073,23 @@ static int ext3_block_truncate_page(handle_t *handle, struct page *page,
                        goto unlock;
        }
 
+       /* data=writeback mode doesn't need transaction to zero-out data */
+       if (!ext3_should_writeback_data(inode)) {
+               /* We journal at most one block */
+               handle = ext3_journal_start(inode, 1);
+               if (IS_ERR(handle)) {
+                       clear_highpage(page);
+                       flush_dcache_page(page);
+                       err = PTR_ERR(handle);
+                       goto unlock;
+               }
+       }
+
        if (ext3_should_journal_data(inode)) {
                BUFFER_TRACE(bh, "get write access");
                err = ext3_journal_get_write_access(handle, bh);
                if (err)
-                       goto unlock;
+                       goto stop;
        }
 
        zero_user(page, offset, length);
@@ -2022,6 +2103,9 @@ static int ext3_block_truncate_page(handle_t *handle, struct page *page,
                        err = ext3_journal_dirty_data(handle, bh);
                mark_buffer_dirty(bh);
        }
+stop:
+       if (handle)
+               ext3_journal_stop(handle);
 
 unlock:
        unlock_page(page);
@@ -2390,8 +2474,6 @@ static void ext3_free_branches(handle_t *handle, struct inode *inode,
 
 int ext3_can_truncate(struct inode *inode)
 {
-       if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
-               return 0;
        if (S_ISREG(inode->i_mode))
                return 1;
        if (S_ISDIR(inode->i_mode))
@@ -2435,7 +2517,6 @@ void ext3_truncate(struct inode *inode)
        struct ext3_inode_info *ei = EXT3_I(inode);
        __le32 *i_data = ei->i_data;
        int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb);
-       struct address_space *mapping = inode->i_mapping;
        int offsets[4];
        Indirect chain[4];
        Indirect *partial;
@@ -2443,7 +2524,8 @@ void ext3_truncate(struct inode *inode)
        int n;
        long last_block;
        unsigned blocksize = inode->i_sb->s_blocksize;
-       struct page *page;
+
+       trace_ext3_truncate_enter(inode);
 
        if (!ext3_can_truncate(inode))
                goto out_notrans;
@@ -2451,37 +2533,12 @@ void ext3_truncate(struct inode *inode)
        if (inode->i_size == 0 && ext3_should_writeback_data(inode))
                ext3_set_inode_state(inode, EXT3_STATE_FLUSH_ON_CLOSE);
 
-       /*
-        * We have to lock the EOF page here, because lock_page() nests
-        * outside journal_start().
-        */
-       if ((inode->i_size & (blocksize - 1)) == 0) {
-               /* Block boundary? Nothing to do */
-               page = NULL;
-       } else {
-               page = grab_cache_page(mapping,
-                               inode->i_size >> PAGE_CACHE_SHIFT);
-               if (!page)
-                       goto out_notrans;
-       }
-
        handle = start_transaction(inode);
-       if (IS_ERR(handle)) {
-               if (page) {
-                       clear_highpage(page);
-                       flush_dcache_page(page);
-                       unlock_page(page);
-                       page_cache_release(page);
-               }
+       if (IS_ERR(handle))
                goto out_notrans;
-       }
 
        last_block = (inode->i_size + blocksize-1)
                                        >> EXT3_BLOCK_SIZE_BITS(inode->i_sb);
-
-       if (page)
-               ext3_block_truncate_page(handle, page, mapping, inode->i_size);
-
        n = ext3_block_to_path(inode, last_block, offsets, NULL);
        if (n == 0)
                goto out_stop;  /* error */
@@ -2596,6 +2653,7 @@ out_stop:
                ext3_orphan_del(handle, inode);
 
        ext3_journal_stop(handle);
+       trace_ext3_truncate_exit(inode);
        return;
 out_notrans:
        /*
@@ -2604,6 +2662,7 @@ out_notrans:
         */
        if (inode->i_nlink)
                ext3_orphan_del(NULL, inode);
+       trace_ext3_truncate_exit(inode);
 }
 
 static ext3_fsblk_t ext3_get_inode_block(struct super_block *sb,
@@ -2745,6 +2804,7 @@ make_io:
                 * has in-inode xattrs, or we don't have this inode in memory.
                 * Read the block from disk.
                 */
+               trace_ext3_load_inode(inode);
                get_bh(bh);
                bh->b_end_io = end_buffer_read_sync;
                submit_bh(READ_META, bh);
@@ -3229,18 +3289,36 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr)
                }
 
                error = ext3_orphan_add(handle, inode);
+               if (error) {
+                       ext3_journal_stop(handle);
+                       goto err_out;
+               }
                EXT3_I(inode)->i_disksize = attr->ia_size;
-               rc = ext3_mark_inode_dirty(handle, inode);
-               if (!error)
-                       error = rc;
+               error = ext3_mark_inode_dirty(handle, inode);
                ext3_journal_stop(handle);
+               if (error) {
+                       /* Some hard fs error must have happened. Bail out. */
+                       ext3_orphan_del(NULL, inode);
+                       goto err_out;
+               }
+               rc = ext3_block_truncate_page(inode, attr->ia_size);
+               if (rc) {
+                       /* Cleanup orphan list and exit */
+                       handle = ext3_journal_start(inode, 3);
+                       if (IS_ERR(handle)) {
+                               ext3_orphan_del(NULL, inode);
+                               goto err_out;
+                       }
+                       ext3_orphan_del(handle, inode);
+                       ext3_journal_stop(handle);
+                       goto err_out;
+               }
        }
 
        if ((attr->ia_valid & ATTR_SIZE) &&
            attr->ia_size != i_size_read(inode)) {
-               rc = vmtruncate(inode, attr->ia_size);
-               if (rc)
-                       goto err_out;
+               truncate_setsize(inode, attr->ia_size);
+               ext3_truncate(inode);
        }
 
        setattr_copy(inode, attr);
@@ -3374,6 +3452,7 @@ int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode)
        int err;
 
        might_sleep();
+       trace_ext3_mark_inode_dirty(inode, _RET_IP_);
        err = ext3_reserve_inode_write(handle, inode, &iloc);
        if (!err)
                err = ext3_mark_iloc_dirty(handle, inode, &iloc);