Merge branch 'for-linus' of git://oss.sgi.com/xfs/xfs
author Linus Torvalds <torvalds@linux-foundation.org>
Fri, 23 Mar 2012 16:19:22 +0000 (09:19 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Fri, 23 Mar 2012 16:19:22 +0000 (09:19 -0700)
Pull XFS updates from Ben Myers:
 "Scalability improvements for dquots, log grant code cleanups, plus
  bugfixes and cleanups large and small"

Fix up various trivial conflicts that were due to some of the earlier
patches already having been integrated into v3.3 as bugfixes, and then
there were development patches on top of those.  Easily merged by just
taking the newer version from the pulled branch.

* 'for-linus' of git://oss.sgi.com/xfs/xfs: (45 commits)
  xfs: fallback to vmalloc for large buffers in xfs_getbmap
  xfs: fallback to vmalloc for large buffers in xfs_attrmulti_attr_get
  xfs: remove remaining scraps of struct xfs_iomap
  xfs: fix inode lookup race
  xfs: clean up minor sparse warnings
  xfs: remove the global xfs_Gqm structure
  xfs: remove the per-filesystem list of dquots
  xfs: use per-filesystem radix trees for dquot lookup
  xfs: per-filesystem dquot LRU lists
  xfs: use common code for quota statistics
  xfs: reimplement fdatasync support
  xfs: split in-core and on-disk inode log item fields
  xfs: make xfs_inode_item_size idempotent
  xfs: log timestamp updates
  xfs: log file size updates at I/O completion time
  xfs: log file size updates as part of unwritten extent conversion
  xfs: do not require an ioend for new EOF calculation
  xfs: use per-filesystem I/O completion workqueues
  quota: make Q_XQUOTASYNC a noop
  xfs: include reservations in quota reporting
  ...

51 files changed:
fs/quota/quota.c
fs/xfs/Makefile
fs/xfs/xfs_aops.c
fs/xfs/xfs_aops.h
fs/xfs/xfs_bmap.c
fs/xfs/xfs_buf.c
fs/xfs/xfs_dfrag.c
fs/xfs/xfs_dir2_block.c
fs/xfs/xfs_dquot.c
fs/xfs/xfs_dquot.h
fs/xfs/xfs_file.c
fs/xfs/xfs_iget.c
fs/xfs/xfs_inode.c
fs/xfs/xfs_inode.h
fs/xfs/xfs_inode_item.c
fs/xfs/xfs_inode_item.h
fs/xfs/xfs_ioctl.c
fs/xfs/xfs_ioctl32.c
fs/xfs/xfs_iomap.c
fs/xfs/xfs_iops.c
fs/xfs/xfs_itable.c
fs/xfs/xfs_log.c
fs/xfs/xfs_log.h
fs/xfs/xfs_log_priv.h
fs/xfs/xfs_log_recover.c
fs/xfs/xfs_mount.c
fs/xfs/xfs_mount.h
fs/xfs/xfs_qm.c
fs/xfs/xfs_qm.h
fs/xfs/xfs_qm_bhv.c
fs/xfs/xfs_qm_stats.c [deleted file]
fs/xfs/xfs_qm_stats.h [deleted file]
fs/xfs/xfs_qm_syscalls.c
fs/xfs/xfs_quota.h
fs/xfs/xfs_quota_priv.h
fs/xfs/xfs_sb.h
fs/xfs/xfs_stats.c
fs/xfs/xfs_stats.h
fs/xfs/xfs_super.c
fs/xfs/xfs_super.h
fs/xfs/xfs_sync.c
fs/xfs/xfs_sync.h
fs/xfs/xfs_trace.h
fs/xfs/xfs_trans.c
fs/xfs/xfs_trans_ail.c
fs/xfs/xfs_trans_buf.c
fs/xfs/xfs_trans_dquot.c
fs/xfs/xfs_trans_inode.c
fs/xfs/xfs_trans_priv.h
fs/xfs/xfs_vnode.h
fs/xfs/xfs_vnodeops.h

index fc2c4388d1262a1771d27acb7c55507369cf0dba..9a391204ca278e4186300215ef521183af70eb68 100644 (file)
@@ -282,10 +282,9 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
        case Q_XGETQUOTA:
                return quota_getxquota(sb, type, id, addr);
        case Q_XQUOTASYNC:
-               /* caller already holds s_umount */
                if (sb->s_flags & MS_RDONLY)
                        return -EROFS;
-               writeback_inodes_sb(sb, WB_REASON_SYNC);
+               /* XFS quotas are fully coherent now, making this call a noop */
                return 0;
        default:
                return -EINVAL;
index 427a4e82a588759dbfb49394f73eca9400d455e7..0a9977983f92b358989f3cb01e81cb0e31db27e7 100644 (file)
@@ -96,9 +96,6 @@ xfs-$(CONFIG_XFS_QUOTA)               += xfs_dquot.o \
                                   xfs_qm_bhv.o \
                                   xfs_qm.o \
                                   xfs_quotaops.o
-ifeq ($(CONFIG_XFS_QUOTA),y)
-xfs-$(CONFIG_PROC_FS)          += xfs_qm_stats.o
-endif
 xfs-$(CONFIG_XFS_RT)           += xfs_rtalloc.o
 xfs-$(CONFIG_XFS_POSIX_ACL)    += xfs_acl.o
 xfs-$(CONFIG_PROC_FS)          += xfs_stats.o
index 74b9baf36ac39038f827c8e262ea62aa33d81de8..0dbb9e70fe21664740bc90721f4ae7bb5d3d190b 100644 (file)
@@ -26,6 +26,7 @@
 #include "xfs_bmap_btree.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
+#include "xfs_inode_item.h"
 #include "xfs_alloc.h"
 #include "xfs_error.h"
 #include "xfs_rw.h"
@@ -98,23 +99,6 @@ xfs_destroy_ioend(
        mempool_free(ioend, xfs_ioend_pool);
 }
 
-/*
- * If the end of the current ioend is beyond the current EOF,
- * return the new EOF value, otherwise zero.
- */
-STATIC xfs_fsize_t
-xfs_ioend_new_eof(
-       xfs_ioend_t             *ioend)
-{
-       xfs_inode_t             *ip = XFS_I(ioend->io_inode);
-       xfs_fsize_t             isize;
-       xfs_fsize_t             bsize;
-
-       bsize = ioend->io_offset + ioend->io_size;
-       isize = MIN(i_size_read(VFS_I(ip)), bsize);
-       return isize > ip->i_d.di_size ? isize : 0;
-}
-
 /*
  * Fast and loose check if this write could update the on-disk inode size.
  */
@@ -124,32 +108,65 @@ static inline bool xfs_ioend_is_append(struct xfs_ioend *ioend)
                XFS_I(ioend->io_inode)->i_d.di_size;
 }
 
+STATIC int
+xfs_setfilesize_trans_alloc(
+       struct xfs_ioend        *ioend)
+{
+       struct xfs_mount        *mp = XFS_I(ioend->io_inode)->i_mount;
+       struct xfs_trans        *tp;
+       int                     error;
+
+       tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
+
+       error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0);
+       if (error) {
+               xfs_trans_cancel(tp, 0);
+               return error;
+       }
+
+       ioend->io_append_trans = tp;
+
+       /*
+        * We hand off the transaction to the completion thread now, so
+        * clear the flag here.
+        */
+       current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
+       return 0;
+}
+
 /*
  * Update on-disk file size now that data has been written to disk.
- *
- * This function does not block as blocking on the inode lock in IO completion
- * can lead to IO completion order dependency deadlocks.. If it can't get the
- * inode ilock it will return EAGAIN. Callers must handle this.
  */
 STATIC int
 xfs_setfilesize(
-       xfs_ioend_t             *ioend)
+       struct xfs_ioend        *ioend)
 {
-       xfs_inode_t             *ip = XFS_I(ioend->io_inode);
+       struct xfs_inode        *ip = XFS_I(ioend->io_inode);
+       struct xfs_trans        *tp = ioend->io_append_trans;
        xfs_fsize_t             isize;
 
-       if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL))
-               return EAGAIN;
+       /*
+        * The transaction was allocated in the I/O submission thread,
+        * thus we need to mark ourselves as being in a transaction
+        * manually.
+        */
+       current_set_flags_nested(&tp->t_pflags, PF_FSTRANS);
 
-       isize = xfs_ioend_new_eof(ioend);
-       if (isize) {
-               trace_xfs_setfilesize(ip, ioend->io_offset, ioend->io_size);
-               ip->i_d.di_size = isize;
-               xfs_mark_inode_dirty(ip);
+       xfs_ilock(ip, XFS_ILOCK_EXCL);
+       isize = xfs_new_eof(ip, ioend->io_offset + ioend->io_size);
+       if (!isize) {
+               xfs_iunlock(ip, XFS_ILOCK_EXCL);
+               xfs_trans_cancel(tp, 0);
+               return 0;
        }
 
-       xfs_iunlock(ip, XFS_ILOCK_EXCL);
-       return 0;
+       trace_xfs_setfilesize(ip, ioend->io_offset, ioend->io_size);
+
+       ip->i_d.di_size = isize;
+       xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+       xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+       return xfs_trans_commit(tp, 0);
 }
 
 /*
@@ -163,10 +180,12 @@ xfs_finish_ioend(
        struct xfs_ioend        *ioend)
 {
        if (atomic_dec_and_test(&ioend->io_remaining)) {
+               struct xfs_mount        *mp = XFS_I(ioend->io_inode)->i_mount;
+
                if (ioend->io_type == IO_UNWRITTEN)
-                       queue_work(xfsconvertd_workqueue, &ioend->io_work);
-               else if (xfs_ioend_is_append(ioend))
-                       queue_work(xfsdatad_workqueue, &ioend->io_work);
+                       queue_work(mp->m_unwritten_workqueue, &ioend->io_work);
+               else if (ioend->io_append_trans)
+                       queue_work(mp->m_data_workqueue, &ioend->io_work);
                else
                        xfs_destroy_ioend(ioend);
        }
@@ -195,35 +214,36 @@ xfs_end_io(
         * range to normal written extens after the data I/O has finished.
         */
        if (ioend->io_type == IO_UNWRITTEN) {
+               /*
+                * For buffered I/O we never preallocate a transaction when
+                * doing the unwritten extent conversion, but for direct I/O
+                * we do not know if we are converting an unwritten extent
+                * or not at the point where we preallocate the transaction.
+                */
+               if (ioend->io_append_trans) {
+                       ASSERT(ioend->io_isdirect);
+
+                       current_set_flags_nested(
+                               &ioend->io_append_trans->t_pflags, PF_FSTRANS);
+                       xfs_trans_cancel(ioend->io_append_trans, 0);
+               }
+
                error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
                                                 ioend->io_size);
                if (error) {
                        ioend->io_error = -error;
                        goto done;
                }
+       } else if (ioend->io_append_trans) {
+               error = xfs_setfilesize(ioend);
+               if (error)
+                       ioend->io_error = -error;
+       } else {
+               ASSERT(!xfs_ioend_is_append(ioend));
        }
 
-       /*
-        * We might have to update the on-disk file size after extending
-        * writes.
-        */
-       error = xfs_setfilesize(ioend);
-       ASSERT(!error || error == EAGAIN);
-
 done:
-       /*
-        * If we didn't complete processing of the ioend, requeue it to the
-        * tail of the workqueue for another attempt later. Otherwise destroy
-        * it.
-        */
-       if (error == EAGAIN) {
-               atomic_inc(&ioend->io_remaining);
-               xfs_finish_ioend(ioend);
-               /* ensure we don't spin on blocked ioends */
-               delay(1);
-       } else {
-               xfs_destroy_ioend(ioend);
-       }
+       xfs_destroy_ioend(ioend);
 }
 
 /*
@@ -259,6 +279,7 @@ xfs_alloc_ioend(
         */
        atomic_set(&ioend->io_remaining, 1);
        ioend->io_isasync = 0;
+       ioend->io_isdirect = 0;
        ioend->io_error = 0;
        ioend->io_list = NULL;
        ioend->io_type = type;
@@ -269,6 +290,7 @@ xfs_alloc_ioend(
        ioend->io_size = 0;
        ioend->io_iocb = NULL;
        ioend->io_result = 0;
+       ioend->io_append_trans = NULL;
 
        INIT_WORK(&ioend->io_work, xfs_end_io);
        return ioend;
@@ -379,14 +401,6 @@ xfs_submit_ioend_bio(
        atomic_inc(&ioend->io_remaining);
        bio->bi_private = ioend;
        bio->bi_end_io = xfs_end_bio;
-
-       /*
-        * If the I/O is beyond EOF we mark the inode dirty immediately
-        * but don't update the inode size until I/O completion.
-        */
-       if (xfs_ioend_new_eof(ioend))
-               xfs_mark_inode_dirty(XFS_I(ioend->io_inode));
-
        submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE, bio);
 }
 
@@ -1033,8 +1047,20 @@ xfs_vm_writepage(
                                  wbc, end_index);
        }
 
-       if (iohead)
+       if (iohead) {
+               /*
+                * Reserve log space if we might write beyond the on-disk
+                * inode size.
+                */
+               if (ioend->io_type != IO_UNWRITTEN &&
+                   xfs_ioend_is_append(ioend)) {
+                       err = xfs_setfilesize_trans_alloc(ioend);
+                       if (err)
+                               goto error;
+               }
+
                xfs_submit_ioend(wbc, iohead);
+       }
 
        return 0;
 
@@ -1314,17 +1340,32 @@ xfs_vm_direct_IO(
 {
        struct inode            *inode = iocb->ki_filp->f_mapping->host;
        struct block_device     *bdev = xfs_find_bdev_for_inode(inode);
+       struct xfs_ioend        *ioend = NULL;
        ssize_t                 ret;
 
        if (rw & WRITE) {
-               iocb->private = xfs_alloc_ioend(inode, IO_DIRECT);
+               size_t size = iov_length(iov, nr_segs);
+
+               /*
+                * We need to preallocate a transaction for a size update
+                * here.  In the case that this write both updates the size
+                * and converts at least one unwritten extent we will cancel
+                * the still clean transaction after the I/O has finished.
+                */
+               iocb->private = ioend = xfs_alloc_ioend(inode, IO_DIRECT);
+               if (offset + size > XFS_I(inode)->i_d.di_size) {
+                       ret = xfs_setfilesize_trans_alloc(ioend);
+                       if (ret)
+                               goto out_destroy_ioend;
+                       ioend->io_isdirect = 1;
+               }
 
                ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov,
                                            offset, nr_segs,
                                            xfs_get_blocks_direct,
                                            xfs_end_io_direct_write, NULL, 0);
                if (ret != -EIOCBQUEUED && iocb->private)
-                       xfs_destroy_ioend(iocb->private);
+                       goto out_trans_cancel;
        } else {
                ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov,
                                            offset, nr_segs,
@@ -1333,6 +1374,16 @@ xfs_vm_direct_IO(
        }
 
        return ret;
+
+out_trans_cancel:
+       if (ioend->io_append_trans) {
+               current_set_flags_nested(&ioend->io_append_trans->t_pflags,
+                                        PF_FSTRANS);
+               xfs_trans_cancel(ioend->io_append_trans, 0);
+       }
+out_destroy_ioend:
+       xfs_destroy_ioend(ioend);
+       return ret;
 }
 
 STATIC void
index 116dd5c370346eb118baa34ce19d8af8ef195b7e..84eafbcb0d9dd65cecd57ef9074eb66b89bb9c08 100644 (file)
@@ -18,8 +18,6 @@
 #ifndef __XFS_AOPS_H__
 #define __XFS_AOPS_H__
 
-extern struct workqueue_struct *xfsdatad_workqueue;
-extern struct workqueue_struct *xfsconvertd_workqueue;
 extern mempool_t *xfs_ioend_pool;
 
 /*
@@ -48,12 +46,14 @@ typedef struct xfs_ioend {
        int                     io_error;       /* I/O error code */
        atomic_t                io_remaining;   /* hold count */
        unsigned int            io_isasync : 1; /* needs aio_complete */
+       unsigned int            io_isdirect : 1;/* direct I/O */
        struct inode            *io_inode;      /* file being written to */
        struct buffer_head      *io_buffer_head;/* buffer linked list head */
        struct buffer_head      *io_buffer_tail;/* buffer linked list tail */
        size_t                  io_size;        /* size of the extent */
        xfs_off_t               io_offset;      /* offset in the file */
        struct work_struct      io_work;        /* xfsdatad work queue */
+       struct xfs_trans        *io_append_trans;/* xact. for size update */
        struct kiocb            *io_iocb;
        int                     io_result;
 } xfs_ioend_t;
index 188ef2fbd62880614a29ea0432e20707d5cf45a2..3548c6f75593d1d1f3acd949ea2677f072f19e2d 100644 (file)
@@ -5536,8 +5536,12 @@ xfs_getbmap(
        if (bmv->bmv_count > ULONG_MAX / sizeof(struct getbmapx))
                return XFS_ERROR(ENOMEM);
        out = kmem_zalloc(bmv->bmv_count * sizeof(struct getbmapx), KM_MAYFAIL);
-       if (!out)
-               return XFS_ERROR(ENOMEM);
+       if (!out) {
+               out = kmem_zalloc_large(bmv->bmv_count *
+                                       sizeof(struct getbmapx));
+               if (!out)
+                       return XFS_ERROR(ENOMEM);
+       }
 
        xfs_ilock(ip, XFS_IOLOCK_SHARED);
        if (whichfork == XFS_DATA_FORK && !(iflags & BMV_IF_DELALLOC)) {
@@ -5661,7 +5665,10 @@ xfs_getbmap(
                        break;
        }
 
-       kmem_free(out);
+       if (is_vmalloc_addr(out))
+               kmem_free_large(out);
+       else
+               kmem_free(out);
        return error;
 }
 
index 4dff85c7d7eb1feda7999ff05f9941f6c42077ff..6819b5163e337f0762351d59f408c0c4156c2544 100644 (file)
@@ -45,8 +45,6 @@ static kmem_zone_t *xfs_buf_zone;
 STATIC int xfsbufd(void *);
 
 static struct workqueue_struct *xfslogd_workqueue;
-struct workqueue_struct *xfsdatad_workqueue;
-struct workqueue_struct *xfsconvertd_workqueue;
 
 #ifdef XFS_BUF_LOCK_TRACKING
 # define XB_SET_OWNER(bp)      ((bp)->b_last_holder = current->pid)
@@ -1793,21 +1791,8 @@ xfs_buf_init(void)
        if (!xfslogd_workqueue)
                goto out_free_buf_zone;
 
-       xfsdatad_workqueue = alloc_workqueue("xfsdatad", WQ_MEM_RECLAIM, 1);
-       if (!xfsdatad_workqueue)
-               goto out_destroy_xfslogd_workqueue;
-
-       xfsconvertd_workqueue = alloc_workqueue("xfsconvertd",
-                                               WQ_MEM_RECLAIM, 1);
-       if (!xfsconvertd_workqueue)
-               goto out_destroy_xfsdatad_workqueue;
-
        return 0;
 
- out_destroy_xfsdatad_workqueue:
-       destroy_workqueue(xfsdatad_workqueue);
- out_destroy_xfslogd_workqueue:
-       destroy_workqueue(xfslogd_workqueue);
  out_free_buf_zone:
        kmem_zone_destroy(xfs_buf_zone);
  out:
@@ -1817,8 +1802,6 @@ xfs_buf_init(void)
 void
 xfs_buf_terminate(void)
 {
-       destroy_workqueue(xfsconvertd_workqueue);
-       destroy_workqueue(xfsdatad_workqueue);
        destroy_workqueue(xfslogd_workqueue);
        kmem_zone_destroy(xfs_buf_zone);
 }
index dd974a55c77daee6de56a44c527e871d7cfe7fca..1137bbc5eccba64c1a53fc4ff91a3b950da7b81f 100644 (file)
@@ -215,7 +215,7 @@ xfs_swap_extents(
        xfs_trans_t     *tp;
        xfs_bstat_t     *sbp = &sxp->sx_stat;
        xfs_ifork_t     *tempifp, *ifp, *tifp;
-       int             ilf_fields, tilf_fields;
+       int             src_log_flags, target_log_flags;
        int             error = 0;
        int             aforkblks = 0;
        int             taforkblks = 0;
@@ -385,9 +385,8 @@ xfs_swap_extents(
        tip->i_delayed_blks = ip->i_delayed_blks;
        ip->i_delayed_blks = 0;
 
-       ilf_fields = XFS_ILOG_CORE;
-
-       switch(ip->i_d.di_format) {
+       src_log_flags = XFS_ILOG_CORE;
+       switch (ip->i_d.di_format) {
        case XFS_DINODE_FMT_EXTENTS:
                /* If the extents fit in the inode, fix the
                 * pointer.  Otherwise it's already NULL or
@@ -397,16 +396,15 @@ xfs_swap_extents(
                        ifp->if_u1.if_extents =
                                ifp->if_u2.if_inline_ext;
                }
-               ilf_fields |= XFS_ILOG_DEXT;
+               src_log_flags |= XFS_ILOG_DEXT;
                break;
        case XFS_DINODE_FMT_BTREE:
-               ilf_fields |= XFS_ILOG_DBROOT;
+               src_log_flags |= XFS_ILOG_DBROOT;
                break;
        }
 
-       tilf_fields = XFS_ILOG_CORE;
-
-       switch(tip->i_d.di_format) {
+       target_log_flags = XFS_ILOG_CORE;
+       switch (tip->i_d.di_format) {
        case XFS_DINODE_FMT_EXTENTS:
                /* If the extents fit in the inode, fix the
                 * pointer.  Otherwise it's already NULL or
@@ -416,10 +414,10 @@ xfs_swap_extents(
                        tifp->if_u1.if_extents =
                                tifp->if_u2.if_inline_ext;
                }
-               tilf_fields |= XFS_ILOG_DEXT;
+               target_log_flags |= XFS_ILOG_DEXT;
                break;
        case XFS_DINODE_FMT_BTREE:
-               tilf_fields |= XFS_ILOG_DBROOT;
+               target_log_flags |= XFS_ILOG_DBROOT;
                break;
        }
 
@@ -427,8 +425,8 @@ xfs_swap_extents(
        xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
        xfs_trans_ijoin(tp, tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
 
-       xfs_trans_log_inode(tp, ip,  ilf_fields);
-       xfs_trans_log_inode(tp, tip, tilf_fields);
+       xfs_trans_log_inode(tp, ip,  src_log_flags);
+       xfs_trans_log_inode(tp, tip, target_log_flags);
 
        /*
         * If this is a synchronous mount, make sure that the
index 9245e029b8eaddb08f58bbdbe3f78d972cfd2610..d3b63aefd01dbf46f7ec473bd2b01570096a7d0e 100644 (file)
@@ -29,6 +29,7 @@
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_inode_item.h"
+#include "xfs_dir2.h"
 #include "xfs_dir2_format.h"
 #include "xfs_dir2_priv.h"
 #include "xfs_error.h"
index 53db20ee3e774fab3f643f8c280e018086dffa10..4be16a0cbe5aee7caf7854c720e371b1e9d851c2 100644 (file)
  * Lock order:
  *
  * ip->i_lock
- *   qh->qh_lock
- *     qi->qi_dqlist_lock
- *       dquot->q_qlock (xfs_dqlock() and friends)
- *         dquot->q_flush (xfs_dqflock() and friends)
- *         xfs_Gqm->qm_dqfrlist_lock
+ *   qi->qi_tree_lock
+ *     dquot->q_qlock (xfs_dqlock() and friends)
+ *       dquot->q_flush (xfs_dqflock() and friends)
+ *       qi->qi_lru_lock
  *
  * If two dquots need to be locked the order is user before group/project,
  * otherwise by the lowest id first, see xfs_dqlock2.
@@ -60,6 +59,9 @@ int xfs_dqreq_num;
 int xfs_dqerror_mod = 33;
 #endif
 
+struct kmem_zone               *xfs_qm_dqtrxzone;
+static struct kmem_zone                *xfs_qm_dqzone;
+
 static struct lock_class_key xfs_dquot_other_class;
 
 /*
@@ -69,12 +71,12 @@ void
 xfs_qm_dqdestroy(
        xfs_dquot_t     *dqp)
 {
-       ASSERT(list_empty(&dqp->q_freelist));
+       ASSERT(list_empty(&dqp->q_lru));
 
        mutex_destroy(&dqp->q_qlock);
-       kmem_zone_free(xfs_Gqm->qm_dqzone, dqp);
+       kmem_zone_free(xfs_qm_dqzone, dqp);
 
-       atomic_dec(&xfs_Gqm->qm_totaldquots);
+       XFS_STATS_DEC(xs_qm_dquot);
 }
 
 /*
@@ -282,7 +284,7 @@ xfs_qm_dqalloc(
         * Return if this type of quotas is turned off while we didn't
         * have an inode lock
         */
-       if (XFS_IS_THIS_QUOTA_OFF(dqp)) {
+       if (!xfs_this_quota_on(dqp->q_mount, dqp->dq_flags)) {
                xfs_iunlock(quotip, XFS_ILOCK_EXCL);
                return (ESRCH);
        }
@@ -384,7 +386,7 @@ xfs_qm_dqtobp(
        dqp->q_fileoffset = (xfs_fileoff_t)id / mp->m_quotainfo->qi_dqperchunk;
 
        xfs_ilock(quotip, XFS_ILOCK_SHARED);
-       if (XFS_IS_THIS_QUOTA_OFF(dqp)) {
+       if (!xfs_this_quota_on(dqp->q_mount, dqp->dq_flags)) {
                /*
                 * Return if this type of quotas is turned off while we
                 * didn't have the quota inode lock.
@@ -492,12 +494,12 @@ xfs_qm_dqread(
        int                     cancelflags = 0;
 
 
-       dqp = kmem_zone_zalloc(xfs_Gqm->qm_dqzone, KM_SLEEP);
+       dqp = kmem_zone_zalloc(xfs_qm_dqzone, KM_SLEEP);
 
        dqp->dq_flags = type;
        dqp->q_core.d_id = cpu_to_be32(id);
        dqp->q_mount = mp;
-       INIT_LIST_HEAD(&dqp->q_freelist);
+       INIT_LIST_HEAD(&dqp->q_lru);
        mutex_init(&dqp->q_qlock);
        init_waitqueue_head(&dqp->q_pinwait);
 
@@ -516,7 +518,7 @@ xfs_qm_dqread(
        if (!(type & XFS_DQ_USER))
                lockdep_set_class(&dqp->q_qlock, &xfs_dquot_other_class);
 
-       atomic_inc(&xfs_Gqm->qm_totaldquots);
+       XFS_STATS_INC(xs_qm_dquot);
 
        trace_xfs_dqread(dqp);
 
@@ -601,60 +603,6 @@ error0:
        return error;
 }
 
-/*
- * Lookup a dquot in the incore dquot hashtable. We keep two separate
- * hashtables for user and group dquots; and, these are global tables
- * inside the XQM, not per-filesystem tables.
- * The hash chain must be locked by caller, and it is left locked
- * on return. Returning dquot is locked.
- */
-STATIC int
-xfs_qm_dqlookup(
-       xfs_mount_t             *mp,
-       xfs_dqid_t              id,
-       xfs_dqhash_t            *qh,
-       xfs_dquot_t             **O_dqpp)
-{
-       xfs_dquot_t             *dqp;
-
-       ASSERT(mutex_is_locked(&qh->qh_lock));
-
-       /*
-        * Traverse the hashchain looking for a match
-        */
-       list_for_each_entry(dqp, &qh->qh_list, q_hashlist) {
-               /*
-                * We already have the hashlock. We don't need the
-                * dqlock to look at the id field of the dquot, since the
-                * id can't be modified without the hashlock anyway.
-                */
-               if (be32_to_cpu(dqp->q_core.d_id) != id || dqp->q_mount != mp)
-                       continue;
-
-               trace_xfs_dqlookup_found(dqp);
-
-               xfs_dqlock(dqp);
-               if (dqp->dq_flags & XFS_DQ_FREEING) {
-                       *O_dqpp = NULL;
-                       xfs_dqunlock(dqp);
-                       return -1;
-               }
-
-               dqp->q_nrefs++;
-
-               /*
-                * move the dquot to the front of the hashchain
-                */
-               list_move(&dqp->q_hashlist, &qh->qh_list);
-               trace_xfs_dqlookup_done(dqp);
-               *O_dqpp = dqp;
-               return 0;
-       }
-
-       *O_dqpp = NULL;
-       return 1;
-}
-
 /*
  * Given the file system, inode OR id, and type (UDQUOT/GDQUOT), return a
  * a locked dquot, doing an allocation (if requested) as needed.
@@ -672,10 +620,10 @@ xfs_qm_dqget(
        uint            flags,    /* DQALLOC, DQSUSER, DQREPAIR, DOWARN */
        xfs_dquot_t     **O_dqpp) /* OUT : locked incore dquot */
 {
-       xfs_dquot_t     *dqp;
-       xfs_dqhash_t    *h;
-       uint            version;
-       int             error;
+       struct xfs_quotainfo    *qi = mp->m_quotainfo;
+       struct radix_tree_root *tree = XFS_DQUOT_TREE(qi, type);
+       struct xfs_dquot        *dqp;
+       int                     error;
 
        ASSERT(XFS_IS_QUOTA_RUNNING(mp));
        if ((! XFS_IS_UQUOTA_ON(mp) && type == XFS_DQ_USER) ||
@@ -683,7 +631,6 @@ xfs_qm_dqget(
            (! XFS_IS_GQUOTA_ON(mp) && type == XFS_DQ_GROUP)) {
                return (ESRCH);
        }
-       h = XFS_DQ_HASH(mp, id, type);
 
 #ifdef DEBUG
        if (xfs_do_dqerror) {
@@ -699,42 +646,33 @@ xfs_qm_dqget(
               type == XFS_DQ_GROUP);
        if (ip) {
                ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
-               if (type == XFS_DQ_USER)
-                       ASSERT(ip->i_udquot == NULL);
-               else
-                       ASSERT(ip->i_gdquot == NULL);
+               ASSERT(xfs_inode_dquot(ip, type) == NULL);
        }
 #endif
 
 restart:
-       mutex_lock(&h->qh_lock);
+       mutex_lock(&qi->qi_tree_lock);
+       dqp = radix_tree_lookup(tree, id);
+       if (dqp) {
+               xfs_dqlock(dqp);
+               if (dqp->dq_flags & XFS_DQ_FREEING) {
+                       xfs_dqunlock(dqp);
+                       mutex_unlock(&qi->qi_tree_lock);
+                       trace_xfs_dqget_freeing(dqp);
+                       delay(1);
+                       goto restart;
+               }
 
-       /*
-        * Look in the cache (hashtable).
-        * The chain is kept locked during lookup.
-        */
-       switch (xfs_qm_dqlookup(mp, id, h, O_dqpp)) {
-       case -1:
-               XQM_STATS_INC(xqmstats.xs_qm_dquot_dups);
-               mutex_unlock(&h->qh_lock);
-               delay(1);
-               goto restart;
-       case 0:
-               XQM_STATS_INC(xqmstats.xs_qm_dqcachehits);
-               /*
-                * The dquot was found, moved to the front of the chain,
-                * taken off the freelist if it was on it, and locked
-                * at this point. Just unlock the hashchain and return.
-                */
-               ASSERT(*O_dqpp);
-               ASSERT(XFS_DQ_IS_LOCKED(*O_dqpp));
-               mutex_unlock(&h->qh_lock);
-               trace_xfs_dqget_hit(*O_dqpp);
-               return 0;       /* success */
-       default:
-               XQM_STATS_INC(xqmstats.xs_qm_dqcachemisses);
-               break;
+               dqp->q_nrefs++;
+               mutex_unlock(&qi->qi_tree_lock);
+
+               trace_xfs_dqget_hit(dqp);
+               XFS_STATS_INC(xs_qm_dqcachehits);
+               *O_dqpp = dqp;
+               return 0;
        }
+       mutex_unlock(&qi->qi_tree_lock);
+       XFS_STATS_INC(xs_qm_dqcachemisses);
 
        /*
         * Dquot cache miss. We don't want to keep the inode lock across
@@ -745,12 +683,6 @@ restart:
         */
        if (ip)
                xfs_iunlock(ip, XFS_ILOCK_EXCL);
-       /*
-        * Save the hashchain version stamp, and unlock the chain, so that
-        * we don't keep the lock across a disk read
-        */
-       version = h->qh_version;
-       mutex_unlock(&h->qh_lock);
 
        error = xfs_qm_dqread(mp, id, type, flags, &dqp);
 
@@ -760,97 +692,53 @@ restart:
        if (error)
                return error;
 
-       /*
-        * Dquot lock comes after hashlock in the lock ordering
-        */
        if (ip) {
                /*
                 * A dquot could be attached to this inode by now, since
                 * we had dropped the ilock.
                 */
-               if (type == XFS_DQ_USER) {
-                       if (!XFS_IS_UQUOTA_ON(mp)) {
-                               /* inode stays locked on return */
-                               xfs_qm_dqdestroy(dqp);
-                               return XFS_ERROR(ESRCH);
-                       }
-                       if (ip->i_udquot) {
+               if (xfs_this_quota_on(mp, type)) {
+                       struct xfs_dquot        *dqp1;
+
+                       dqp1 = xfs_inode_dquot(ip, type);
+                       if (dqp1) {
                                xfs_qm_dqdestroy(dqp);
-                               dqp = ip->i_udquot;
+                               dqp = dqp1;
                                xfs_dqlock(dqp);
                                goto dqret;
                        }
                } else {
-                       if (!XFS_IS_OQUOTA_ON(mp)) {
-                               /* inode stays locked on return */
-                               xfs_qm_dqdestroy(dqp);
-                               return XFS_ERROR(ESRCH);
-                       }
-                       if (ip->i_gdquot) {
-                               xfs_qm_dqdestroy(dqp);
-                               dqp = ip->i_gdquot;
-                               xfs_dqlock(dqp);
-                               goto dqret;
-                       }
+                       /* inode stays locked on return */
+                       xfs_qm_dqdestroy(dqp);
+                       return XFS_ERROR(ESRCH);
                }
        }
 
-       /*
-        * Hashlock comes after ilock in lock order
-        */
-       mutex_lock(&h->qh_lock);
-       if (version != h->qh_version) {
-               xfs_dquot_t *tmpdqp;
+       mutex_lock(&qi->qi_tree_lock);
+       error = -radix_tree_insert(tree, id, dqp);
+       if (unlikely(error)) {
+               WARN_ON(error != EEXIST);
+
                /*
-                * Now, see if somebody else put the dquot in the
-                * hashtable before us. This can happen because we didn't
-                * keep the hashchain lock. We don't have to worry about
-                * lock order between the two dquots here since dqp isn't
-                * on any findable lists yet.
+                * Duplicate found. Just throw away the new dquot and start
+                * over.
                 */
-               switch (xfs_qm_dqlookup(mp, id, h, &tmpdqp)) {
-               case 0:
-               case -1:
-                       /*
-                        * Duplicate found, either in cache or on its way out.
-                        * Just throw away the new dquot and start over.
-                        */
-                       if (tmpdqp)
-                               xfs_qm_dqput(tmpdqp);
-                       mutex_unlock(&h->qh_lock);
-                       xfs_qm_dqdestroy(dqp);
-                       XQM_STATS_INC(xqmstats.xs_qm_dquot_dups);
-                       goto restart;
-               default:
-                       break;
-               }
+               mutex_unlock(&qi->qi_tree_lock);
+               trace_xfs_dqget_dup(dqp);
+               xfs_qm_dqdestroy(dqp);
+               XFS_STATS_INC(xs_qm_dquot_dups);
+               goto restart;
        }
 
-       /*
-        * Put the dquot at the beginning of the hash-chain and mp's list
-        * LOCK ORDER: hashlock, freelistlock, mplistlock, udqlock, gdqlock ..
-        */
-       ASSERT(mutex_is_locked(&h->qh_lock));
-       dqp->q_hash = h;
-       list_add(&dqp->q_hashlist, &h->qh_list);
-       h->qh_version++;
-
-       /*
-        * Attach this dquot to this filesystem's list of all dquots,
-        * kept inside the mount structure in m_quotainfo field
-        */
-       mutex_lock(&mp->m_quotainfo->qi_dqlist_lock);
-
        /*
         * We return a locked dquot to the caller, with a reference taken
         */
        xfs_dqlock(dqp);
        dqp->q_nrefs = 1;
 
-       list_add(&dqp->q_mplist, &mp->m_quotainfo->qi_dqlist);
-       mp->m_quotainfo->qi_dquots++;
-       mutex_unlock(&mp->m_quotainfo->qi_dqlist_lock);
-       mutex_unlock(&h->qh_lock);
+       qi->qi_dquots++;
+       mutex_unlock(&qi->qi_tree_lock);
+
  dqret:
        ASSERT((ip == NULL) || xfs_isilocked(ip, XFS_ILOCK_EXCL));
        trace_xfs_dqget_miss(dqp);
@@ -859,37 +747,22 @@ restart:
 }
 
 
-/*
- * Release a reference to the dquot (decrement ref-count)
- * and unlock it. If there is a group quota attached to this
- * dquot, carefully release that too without tripping over
- * deadlocks'n'stuff.
- */
-void
-xfs_qm_dqput(
+STATIC void
+xfs_qm_dqput_final(
        struct xfs_dquot        *dqp)
 {
+       struct xfs_quotainfo    *qi = dqp->q_mount->m_quotainfo;
        struct xfs_dquot        *gdqp;
 
-       ASSERT(dqp->q_nrefs > 0);
-       ASSERT(XFS_DQ_IS_LOCKED(dqp));
-
-       trace_xfs_dqput(dqp);
-
-recurse:
-       if (--dqp->q_nrefs > 0) {
-               xfs_dqunlock(dqp);
-               return;
-       }
-
        trace_xfs_dqput_free(dqp);
 
-       mutex_lock(&xfs_Gqm->qm_dqfrlist_lock);
-       if (list_empty(&dqp->q_freelist)) {
-               list_add_tail(&dqp->q_freelist, &xfs_Gqm->qm_dqfrlist);
-               xfs_Gqm->qm_dqfrlist_cnt++;
+       mutex_lock(&qi->qi_lru_lock);
+       if (list_empty(&dqp->q_lru)) {
+               list_add_tail(&dqp->q_lru, &qi->qi_lru_list);
+               qi->qi_lru_count++;
+               XFS_STATS_INC(xs_qm_dquot_unused);
        }
-       mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
+       mutex_unlock(&qi->qi_lru_lock);
 
        /*
         * If we just added a udquot to the freelist, then we want to release
@@ -906,10 +779,29 @@ recurse:
        /*
         * If we had a group quota hint, release it now.
         */
-       if (gdqp) {
-               dqp = gdqp;
-               goto recurse;
-       }
+       if (gdqp)
+               xfs_qm_dqput(gdqp);
+}
+
+/*
+ * Release a reference to the dquot (decrement ref-count) and unlock it.
+ *
+ * If there is a group quota attached to this dquot, carefully release that
+ * too without tripping over deadlocks'n'stuff.
+ */
+void
+xfs_qm_dqput(
+       struct xfs_dquot        *dqp)
+{
+       ASSERT(dqp->q_nrefs > 0);
+       ASSERT(XFS_DQ_IS_LOCKED(dqp));
+
+       trace_xfs_dqput(dqp);
+
+       if (--dqp->q_nrefs > 0)
+               xfs_dqunlock(dqp);
+       else
+               xfs_qm_dqput_final(dqp);
 }
 
 /*
@@ -1091,17 +983,6 @@ xfs_qm_dqflush(
 
 }
 
-void
-xfs_dqunlock(
-       xfs_dquot_t *dqp)
-{
-       xfs_dqunlock_nonotify(dqp);
-       if (dqp->q_logitem.qli_dquot == dqp) {
-               xfs_trans_unlocked_item(dqp->q_logitem.qli_item.li_ailp,
-                                       &dqp->q_logitem.qli_item);
-       }
-}
-
 /*
  * Lock two xfs_dquot structures.
  *
@@ -1130,85 +1011,6 @@ xfs_dqlock2(
        }
 }
 
-/*
- * Take a dquot out of the mount's dqlist as well as the hashlist.  This is
- * called via unmount as well as quotaoff, and the purge will always succeed.
- */
-void
-xfs_qm_dqpurge(
-       struct xfs_dquot        *dqp)
-{
-       struct xfs_mount        *mp = dqp->q_mount;
-       struct xfs_dqhash       *qh = dqp->q_hash;
-
-       xfs_dqlock(dqp);
-
-       /*
-        * If we're turning off quotas, we have to make sure that, for
-        * example, we don't delete quota disk blocks while dquots are
-        * in the process of getting written to those disk blocks.
-        * This dquot might well be on AIL, and we can't leave it there
-        * if we're turning off quotas. Basically, we need this flush
-        * lock, and are willing to block on it.
-        */
-       if (!xfs_dqflock_nowait(dqp)) {
-               /*
-                * Block on the flush lock after nudging dquot buffer,
-                * if it is incore.
-                */
-               xfs_dqflock_pushbuf_wait(dqp);
-       }
-
-       /*
-        * If we are turning this type of quotas off, we don't care
-        * about the dirty metadata sitting in this dquot. OTOH, if
-        * we're unmounting, we do care, so we flush it and wait.
-        */
-       if (XFS_DQ_IS_DIRTY(dqp)) {
-               int     error;
-
-               /*
-                * We don't care about getting disk errors here. We need
-                * to purge this dquot anyway, so we go ahead regardless.
-                */
-               error = xfs_qm_dqflush(dqp, SYNC_WAIT);
-               if (error)
-                       xfs_warn(mp, "%s: dquot %p flush failed",
-                               __func__, dqp);
-               xfs_dqflock(dqp);
-       }
-
-       ASSERT(atomic_read(&dqp->q_pincount) == 0);
-       ASSERT(XFS_FORCED_SHUTDOWN(mp) ||
-              !(dqp->q_logitem.qli_item.li_flags & XFS_LI_IN_AIL));
-
-       xfs_dqfunlock(dqp);
-       xfs_dqunlock(dqp);
-
-       mutex_lock(&qh->qh_lock);
-       list_del_init(&dqp->q_hashlist);
-       qh->qh_version++;
-       mutex_unlock(&qh->qh_lock);
-
-       mutex_lock(&mp->m_quotainfo->qi_dqlist_lock);
-       list_del_init(&dqp->q_mplist);
-       mp->m_quotainfo->qi_dqreclaims++;
-       mp->m_quotainfo->qi_dquots--;
-       mutex_unlock(&mp->m_quotainfo->qi_dqlist_lock);
-
-       /*
-        * We move dquots to the freelist as soon as their reference count
-        * hits zero, so it really should be on the freelist here.
-        */
-       mutex_lock(&xfs_Gqm->qm_dqfrlist_lock);
-       ASSERT(!list_empty(&dqp->q_freelist));
-       list_del_init(&dqp->q_freelist);
-       xfs_Gqm->qm_dqfrlist_cnt--;
-       mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
-
-       xfs_qm_dqdestroy(dqp);
-}
-
 /*
  * Give the buffer a little push if it is incore and
  * wait on the flush lock.
@@ -1241,3 +1043,31 @@ xfs_dqflock_pushbuf_wait(
 out_lock:
        xfs_dqflock(dqp);
 }
+
+int __init
+xfs_qm_init(void)
+{
+       xfs_qm_dqzone =
+               kmem_zone_init(sizeof(struct xfs_dquot), "xfs_dquot");
+       if (!xfs_qm_dqzone)
+               goto out;
+
+       xfs_qm_dqtrxzone =
+               kmem_zone_init(sizeof(struct xfs_dquot_acct), "xfs_dqtrx");
+       if (!xfs_qm_dqtrxzone)
+               goto out_free_dqzone;
+
+       return 0;
+
+out_free_dqzone:
+       kmem_zone_destroy(xfs_qm_dqzone);
+out:
+       return -ENOMEM;
+}
+
+void __exit
+xfs_qm_exit(void)
+{
+       kmem_zone_destroy(xfs_qm_dqtrxzone);
+       kmem_zone_destroy(xfs_qm_dqzone);
+}
index a1d91d8f18027b9e217f07ff8769d9d4a21fe2a1..ef9190bd8b300a061244d1d294a90acf39610b08 100644 (file)
  * when quotas are off.
  */
 
-/*
- * The hash chain headers (hash buckets)
- */
-typedef struct xfs_dqhash {
-       struct list_head  qh_list;
-       struct mutex      qh_lock;
-       uint              qh_version;   /* ever increasing version */
-       uint              qh_nelems;    /* number of dquots on the list */
-} xfs_dqhash_t;
-
 struct xfs_mount;
 struct xfs_trans;
 
@@ -47,10 +37,7 @@ struct xfs_trans;
  */
 typedef struct xfs_dquot {
        uint             dq_flags;      /* various flags (XFS_DQ_*) */
-       struct list_head q_freelist;    /* global free list of dquots */
-       struct list_head q_mplist;      /* mount's list of dquots */
-       struct list_head q_hashlist;    /* gloabl hash list of dquots */
-       xfs_dqhash_t    *q_hash;        /* the hashchain header */
+       struct list_head q_lru;         /* per-fs LRU list of unused dquots */
        struct xfs_mount*q_mount;       /* filesystem this relates to */
        struct xfs_trans*q_transp;      /* trans this belongs to currently */
        uint             q_nrefs;       /* # active refs from inodes */
@@ -110,11 +97,37 @@ static inline void xfs_dqlock(struct xfs_dquot *dqp)
        mutex_lock(&dqp->q_qlock);
 }
 
-static inline void xfs_dqunlock_nonotify(struct xfs_dquot *dqp)
+static inline void xfs_dqunlock(struct xfs_dquot *dqp)
 {
        mutex_unlock(&dqp->q_qlock);
 }
 
+static inline int xfs_this_quota_on(struct xfs_mount *mp, int type)
+{
+       switch (type & XFS_DQ_ALLTYPES) {
+       case XFS_DQ_USER:
+               return XFS_IS_UQUOTA_ON(mp);
+       case XFS_DQ_GROUP:
+       case XFS_DQ_PROJ:
+               return XFS_IS_OQUOTA_ON(mp);
+       default:
+               return 0;
+       }
+}
+
+static inline xfs_dquot_t *xfs_inode_dquot(struct xfs_inode *ip, int type)
+{
+       switch (type & XFS_DQ_ALLTYPES) {
+       case XFS_DQ_USER:
+               return ip->i_udquot;
+       case XFS_DQ_GROUP:
+       case XFS_DQ_PROJ:
+               return ip->i_gdquot;
+       default:
+               return NULL;
+       }
+}
+
 #define XFS_DQ_IS_LOCKED(dqp)  (mutex_is_locked(&((dqp)->q_qlock)))
 #define XFS_DQ_IS_DIRTY(dqp)   ((dqp)->dq_flags & XFS_DQ_DIRTY)
 #define XFS_QM_ISUDQ(dqp)      ((dqp)->dq_flags & XFS_DQ_USER)
@@ -125,15 +138,10 @@ static inline void xfs_dqunlock_nonotify(struct xfs_dquot *dqp)
                                 XFS_DQ_TO_QINF(dqp)->qi_uquotaip : \
                                 XFS_DQ_TO_QINF(dqp)->qi_gquotaip)
 
-#define XFS_IS_THIS_QUOTA_OFF(d) (! (XFS_QM_ISUDQ(d) ? \
-                                    (XFS_IS_UQUOTA_ON((d)->q_mount)) : \
-                                    (XFS_IS_OQUOTA_ON((d)->q_mount))))
-
 extern int             xfs_qm_dqread(struct xfs_mount *, xfs_dqid_t, uint,
                                        uint, struct xfs_dquot  **);
 extern void            xfs_qm_dqdestroy(xfs_dquot_t *);
 extern int             xfs_qm_dqflush(xfs_dquot_t *, uint);
-extern void            xfs_qm_dqpurge(xfs_dquot_t *);
 extern void            xfs_qm_dqunpin_wait(xfs_dquot_t *);
 extern void            xfs_qm_adjust_dqtimers(xfs_mount_t *,
                                        xfs_disk_dquot_t *);
@@ -144,7 +152,6 @@ extern int          xfs_qm_dqget(xfs_mount_t *, xfs_inode_t *,
 extern void            xfs_qm_dqput(xfs_dquot_t *);
 
 extern void            xfs_dqlock2(struct xfs_dquot *, struct xfs_dquot *);
-extern void            xfs_dqunlock(struct xfs_dquot *);
 extern void            xfs_dqflock_pushbuf_wait(struct xfs_dquot *dqp);
 
 static inline struct xfs_dquot *xfs_qm_dqhold(struct xfs_dquot *dqp)
index 7e5bc872f2b4fb12d67f3da3796f3c5b86ac162c..54a67dd9ac0a5fbe5a7caf4271d14c67798fd872 100644 (file)
@@ -163,7 +163,6 @@ xfs_file_fsync(
        struct inode            *inode = file->f_mapping->host;
        struct xfs_inode        *ip = XFS_I(inode);
        struct xfs_mount        *mp = ip->i_mount;
-       struct xfs_trans        *tp;
        int                     error = 0;
        int                     log_flushed = 0;
        xfs_lsn_t               lsn = 0;
@@ -194,75 +193,18 @@ xfs_file_fsync(
        }
 
        /*
-        * We always need to make sure that the required inode state is safe on
-        * disk.  The inode might be clean but we still might need to force the
-        * log because of committed transactions that haven't hit the disk yet.
-        * Likewise, there could be unflushed non-transactional changes to the
-        * inode core that have to go to disk and this requires us to issue
-        * a synchronous transaction to capture these changes correctly.
-        *
-        * This code relies on the assumption that if the i_update_core field
-        * of the inode is clear and the inode is unpinned then it is clean
-        * and no action is required.
+        * All metadata updates are logged, which means that we just have
+        * to flush the log up to the latest LSN that touched the inode.
         */
        xfs_ilock(ip, XFS_ILOCK_SHARED);
-
-       /*
-        * First check if the VFS inode is marked dirty.  All the dirtying
-        * of non-transactional updates do not go through mark_inode_dirty*,
-        * which allows us to distinguish between pure timestamp updates
-        * and i_size updates which need to be caught for fdatasync.
-        * After that also check for the dirty state in the XFS inode, which
-        * might gets cleared when the inode gets written out via the AIL
-        * or xfs_iflush_cluster.
-        */
-       if (((inode->i_state & I_DIRTY_DATASYNC) ||
-           ((inode->i_state & I_DIRTY_SYNC) && !datasync)) &&
-           ip->i_update_core) {
-               /*
-                * Kick off a transaction to log the inode core to get the
-                * updates.  The sync transaction will also force the log.
-                */
-               xfs_iunlock(ip, XFS_ILOCK_SHARED);
-               tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
-               error = xfs_trans_reserve(tp, 0,
-                               XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0);
-               if (error) {
-                       xfs_trans_cancel(tp, 0);
-                       return -error;
-               }
-               xfs_ilock(ip, XFS_ILOCK_EXCL);
-
-               /*
-                * Note - it's possible that we might have pushed ourselves out
-                * of the way during trans_reserve which would flush the inode.
-                * But there's no guarantee that the inode buffer has actually
-                * gone out yet (it's delwri).  Plus the buffer could be pinned
-                * anyway if it's part of an inode in another recent
-                * transaction.  So we play it safe and fire off the
-                * transaction anyway.
-                */
-               xfs_trans_ijoin(tp, ip, 0);
-               xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-               error = xfs_trans_commit(tp, 0);
-
-               lsn = ip->i_itemp->ili_last_lsn;
-               xfs_iunlock(ip, XFS_ILOCK_EXCL);
-       } else {
-               /*
-                * Timestamps/size haven't changed since last inode flush or
-                * inode transaction commit.  That means either nothing got
-                * written or a transaction committed which caught the updates.
-                * If the latter happened and the transaction hasn't hit the
-                * disk yet, the inode will be still be pinned.  If it is,
-                * force the log.
-                */
-               if (xfs_ipincount(ip))
+       if (xfs_ipincount(ip)) {
+               if (!datasync ||
+                   (ip->i_itemp->ili_fields & ~XFS_ILOG_TIMESTAMP))
                        lsn = ip->i_itemp->ili_last_lsn;
-               xfs_iunlock(ip, XFS_ILOCK_SHARED);
        }
+       xfs_iunlock(ip, XFS_ILOCK_SHARED);
 
-       if (!error && lsn)
+       if (lsn)
                error = _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, &log_flushed);
 
        /*
@@ -659,9 +601,6 @@ restart:
                return error;
        }
 
-       if (likely(!(file->f_mode & FMODE_NOCMTIME)))
-               file_update_time(file);
-
        /*
         * If the offset is beyond the size of the file, we need to zero any
         * blocks that fall between the existing EOF and the start of this
@@ -684,6 +623,15 @@ restart:
        if (error)
                return error;
 
+       /*
+        * Updating the timestamps will grab the ilock again from
+        * xfs_fs_dirty_inode, so we have to call it after dropping the
+        * lock above.  Eventually we should look into a way to avoid
+        * the pointless lock roundtrip.
+        */
+       if (likely(!(file->f_mode & FMODE_NOCMTIME)))
+               file_update_time(file);
+
        /*
         * If we're writing the file then make sure to clear the setuid and
         * setgid bits if the process is not being run by root.  This keeps
index 8c3e46394d484c3fbd798913c6862d90b3f820e7..a98cb4524e6cbc8d0c5064813014c36963bc03e0 100644 (file)
@@ -91,7 +91,6 @@ xfs_inode_alloc(
        ip->i_afp = NULL;
        memset(&ip->i_df, 0, sizeof(xfs_ifork_t));
        ip->i_flags = 0;
-       ip->i_update_core = 0;
        ip->i_delayed_blks = 0;
        memset(&ip->i_d, 0, sizeof(xfs_icdinode_t));
 
@@ -350,9 +349,20 @@ xfs_iget_cache_miss(
                        BUG();
        }
 
-       spin_lock(&pag->pag_ici_lock);
+       /*
+        * These values must be set before inserting the inode into the radix
+        * tree as the moment it is inserted a concurrent lookup (allowed by the
+        * RCU locking mechanism) can find it and that lookup must see that this
+        * is an inode currently under construction (i.e. that XFS_INEW is set).
+        * The ip->i_flags_lock that protects the XFS_INEW flag forms the
+        * memory barrier that ensures this detection works correctly at lookup
+        * time.
+        */
+       ip->i_udquot = ip->i_gdquot = NULL;
+       xfs_iflags_set(ip, XFS_INEW);
 
        /* insert the new inode */
+       spin_lock(&pag->pag_ici_lock);
        error = radix_tree_insert(&pag->pag_ici_root, agino, ip);
        if (unlikely(error)) {
                WARN_ON(error != -EEXIST);
@@ -360,11 +370,6 @@ xfs_iget_cache_miss(
                error = EAGAIN;
                goto out_preload_end;
        }
-
-       /* These values _must_ be set before releasing the radix tree lock! */
-       ip->i_udquot = ip->i_gdquot = NULL;
-       xfs_iflags_set(ip, XFS_INEW);
-
        spin_unlock(&pag->pag_ici_lock);
        radix_tree_preload_end();
 
@@ -418,6 +423,15 @@ xfs_iget(
        xfs_perag_t     *pag;
        xfs_agino_t     agino;
 
+       /*
+        * xfs_reclaim_inode() uses the ILOCK to ensure an inode
+        * doesn't get freed while it's being referenced during a
+        * radix tree traversal here.  It assumes this function
+        * acquires only the ILOCK (and therefore it has no need to
+        * involve the IOLOCK in this synchronization).
+        */
+       ASSERT((lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) == 0);
+
        /* reject inode numbers outside existing AGs */
        if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
                return EINVAL;
@@ -642,8 +656,7 @@ xfs_iunlock(
               (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
        ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
               (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
-       ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_IUNLOCK_NONOTIFY |
-                       XFS_LOCK_DEP_MASK)) == 0);
+       ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
        ASSERT(lock_flags != 0);
 
        if (lock_flags & XFS_IOLOCK_EXCL)
@@ -656,16 +669,6 @@ xfs_iunlock(
        else if (lock_flags & XFS_ILOCK_SHARED)
                mrunlock_shared(&ip->i_lock);
 
-       if ((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) &&
-           !(lock_flags & XFS_IUNLOCK_NONOTIFY) && ip->i_itemp) {
-               /*
-                * Let the AIL know that this item has been unlocked in case
-                * it is in the AIL and anyone is waiting on it.  Don't do
-                * this if the caller has asked us not to.
-                */
-               xfs_trans_unlocked_item(ip->i_itemp->ili_item.li_ailp,
-                                       (xfs_log_item_t*)(ip->i_itemp));
-       }
        trace_xfs_iunlock(ip, lock_flags, _RET_IP_);
 }
 
index b21022499c2e8f302699f80ca2af344301fee941..bc46c0a133d373d3afffad99f1323a8d15b52c19 100644 (file)
@@ -1656,14 +1656,13 @@ retry:
                        iip = ip->i_itemp;
                        if (!iip || xfs_inode_clean(ip)) {
                                ASSERT(ip != free_ip);
-                               ip->i_update_core = 0;
                                xfs_ifunlock(ip);
                                xfs_iunlock(ip, XFS_ILOCK_EXCL);
                                continue;
                        }
 
-                       iip->ili_last_fields = iip->ili_format.ilf_fields;
-                       iip->ili_format.ilf_fields = 0;
+                       iip->ili_last_fields = iip->ili_fields;
+                       iip->ili_fields = 0;
                        iip->ili_logged = 1;
                        xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
                                                &iip->ili_item.li_lsn);
@@ -2177,7 +2176,7 @@ xfs_iflush_fork(
        mp = ip->i_mount;
        switch (XFS_IFORK_FORMAT(ip, whichfork)) {
        case XFS_DINODE_FMT_LOCAL:
-               if ((iip->ili_format.ilf_fields & dataflag[whichfork]) &&
+               if ((iip->ili_fields & dataflag[whichfork]) &&
                    (ifp->if_bytes > 0)) {
                        ASSERT(ifp->if_u1.if_data != NULL);
                        ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork));
@@ -2187,8 +2186,8 @@ xfs_iflush_fork(
 
        case XFS_DINODE_FMT_EXTENTS:
                ASSERT((ifp->if_flags & XFS_IFEXTENTS) ||
-                      !(iip->ili_format.ilf_fields & extflag[whichfork]));
-               if ((iip->ili_format.ilf_fields & extflag[whichfork]) &&
+                      !(iip->ili_fields & extflag[whichfork]));
+               if ((iip->ili_fields & extflag[whichfork]) &&
                    (ifp->if_bytes > 0)) {
                        ASSERT(xfs_iext_get_ext(ifp, 0));
                        ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) > 0);
@@ -2198,7 +2197,7 @@ xfs_iflush_fork(
                break;
 
        case XFS_DINODE_FMT_BTREE:
-               if ((iip->ili_format.ilf_fields & brootflag[whichfork]) &&
+               if ((iip->ili_fields & brootflag[whichfork]) &&
                    (ifp->if_broot_bytes > 0)) {
                        ASSERT(ifp->if_broot != NULL);
                        ASSERT(ifp->if_broot_bytes <=
@@ -2211,14 +2210,14 @@ xfs_iflush_fork(
                break;
 
        case XFS_DINODE_FMT_DEV:
-               if (iip->ili_format.ilf_fields & XFS_ILOG_DEV) {
+               if (iip->ili_fields & XFS_ILOG_DEV) {
                        ASSERT(whichfork == XFS_DATA_FORK);
                        xfs_dinode_put_rdev(dip, ip->i_df.if_u2.if_rdev);
                }
                break;
 
        case XFS_DINODE_FMT_UUID:
-               if (iip->ili_format.ilf_fields & XFS_ILOG_UUID) {
+               if (iip->ili_fields & XFS_ILOG_UUID) {
                        ASSERT(whichfork == XFS_DATA_FORK);
                        memcpy(XFS_DFORK_DPTR(dip),
                               &ip->i_df.if_u2.if_uuid,
@@ -2451,9 +2450,8 @@ xfs_iflush(
         * to disk, because the log record didn't make it to disk!
         */
        if (XFS_FORCED_SHUTDOWN(mp)) {
-               ip->i_update_core = 0;
                if (iip)
-                       iip->ili_format.ilf_fields = 0;
+                       iip->ili_fields = 0;
                xfs_ifunlock(ip);
                return XFS_ERROR(EIO);
        }
@@ -2533,26 +2531,6 @@ xfs_iflush_int(
        /* set *dip = inode's place in the buffer */
        dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset);
 
-       /*
-        * Clear i_update_core before copying out the data.
-        * This is for coordination with our timestamp updates
-        * that don't hold the inode lock. They will always
-        * update the timestamps BEFORE setting i_update_core,
-        * so if we clear i_update_core after they set it we
-        * are guaranteed to see their updates to the timestamps.
-        * I believe that this depends on strongly ordered memory
-        * semantics, but we have that.  We use the SYNCHRONIZE
-        * macro to make sure that the compiler does not reorder
-        * the i_update_core access below the data copy below.
-        */
-       ip->i_update_core = 0;
-       SYNCHRONIZE();
-
-       /*
-        * Make sure to get the latest timestamps from the Linux inode.
-        */
-       xfs_synchronize_times(ip);
-
        if (XFS_TEST_ERROR(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC),
                               mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) {
                xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
@@ -2663,36 +2641,33 @@ xfs_iflush_int(
        xfs_inobp_check(mp, bp);
 
        /*
-        * We've recorded everything logged in the inode, so we'd
-        * like to clear the ilf_fields bits so we don't log and
-        * flush things unnecessarily.  However, we can't stop
-        * logging all this information until the data we've copied
-        * into the disk buffer is written to disk.  If we did we might
-        * overwrite the copy of the inode in the log with all the
-        * data after re-logging only part of it, and in the face of
-        * a crash we wouldn't have all the data we need to recover.
+        * We've recorded everything logged in the inode, so we'd like to clear
+        * the ili_fields bits so we don't log and flush things unnecessarily.
+        * However, we can't stop logging all this information until the data
+        * we've copied into the disk buffer is written to disk.  If we did we
+        * might overwrite the copy of the inode in the log with all the data
+        * after re-logging only part of it, and in the face of a crash we
+        * wouldn't have all the data we need to recover.
         *
-        * What we do is move the bits to the ili_last_fields field.
-        * When logging the inode, these bits are moved back to the
-        * ilf_fields field.  In the xfs_iflush_done() routine we
-        * clear ili_last_fields, since we know that the information
-        * those bits represent is permanently on disk.  As long as
-        * the flush completes before the inode is logged again, then
-        * both ilf_fields and ili_last_fields will be cleared.
+        * What we do is move the bits to the ili_last_fields field.  When
+        * logging the inode, these bits are moved back to the ili_fields field.
+        * In the xfs_iflush_done() routine we clear ili_last_fields, since we
+        * know that the information those bits represent is permanently on
+        * disk.  As long as the flush completes before the inode is logged
+        * again, then both ili_fields and ili_last_fields will be cleared.
         *
-        * We can play with the ilf_fields bits here, because the inode
-        * lock must be held exclusively in order to set bits there
-        * and the flush lock protects the ili_last_fields bits.
-        * Set ili_logged so the flush done
-        * routine can tell whether or not to look in the AIL.
-        * Also, store the current LSN of the inode so that we can tell
-        * whether the item has moved in the AIL from xfs_iflush_done().
-        * In order to read the lsn we need the AIL lock, because
-        * it is a 64 bit value that cannot be read atomically.
+        * We can play with the ili_fields bits here, because the inode lock
+        * must be held exclusively in order to set bits there and the flush
+        * lock protects the ili_last_fields bits.  Set ili_logged so the flush
+        * done routine can tell whether or not to look in the AIL.  Also, store
+        * the current LSN of the inode so that we can tell whether the item has
+        * moved in the AIL from xfs_iflush_done().  In order to read the lsn we
+        * need the AIL lock, because it is a 64 bit value that cannot be read
+        * atomically.
         */
-       if (iip != NULL && iip->ili_format.ilf_fields != 0) {
-               iip->ili_last_fields = iip->ili_format.ilf_fields;
-               iip->ili_format.ilf_fields = 0;
+       if (iip != NULL && iip->ili_fields != 0) {
+               iip->ili_last_fields = iip->ili_fields;
+               iip->ili_fields = 0;
                iip->ili_logged = 1;
 
                xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
@@ -2711,8 +2686,7 @@ xfs_iflush_int(
        } else {
                /*
                 * We're flushing an inode which is not in the AIL and has
-                * not been logged but has i_update_core set.  For this
-                * case we can use a B_DELWRI flush and immediately drop
+                * not been logged.  For this case we can immediately drop
                 * the inode flush lock because we can avoid the whole
                 * AIL state thing.  It's OK to drop the flush lock now,
                 * because we've already locked the buffer and to do anything
index 2f27b745408520b73bab9bd8a1a2ca4ed1f96ea0..f123dbe6d42a0e4203f1db43980dd29e097311f9 100644 (file)
@@ -241,7 +241,6 @@ typedef struct xfs_inode {
        spinlock_t              i_flags_lock;   /* inode i_flags lock */
        /* Miscellaneous state. */
        unsigned long           i_flags;        /* see defined flags below */
-       unsigned char           i_update_core;  /* timestamps/size is dirty */
        unsigned int            i_delayed_blks; /* count of delay alloc blks */
 
        xfs_icdinode_t          i_d;            /* most of ondisk inode */
@@ -274,6 +273,20 @@ static inline xfs_fsize_t XFS_ISIZE(struct xfs_inode *ip)
        return ip->i_d.di_size;
 }
 
+/*
+ * If this I/O goes past the on-disk inode size, return the new on-disk size,
+ * clamped to the current in-core inode size.  Otherwise return 0.
+ */
+static inline xfs_fsize_t
+xfs_new_eof(struct xfs_inode *ip, xfs_fsize_t new_size)
+{
+       xfs_fsize_t i_size = i_size_read(VFS_I(ip));
+
+       if (new_size > i_size)
+               new_size = i_size;
+       return new_size > ip->i_d.di_size ? new_size : 0;
+}
+
 /*
  * i_flags helper functions
  */
@@ -422,7 +435,6 @@ static inline int xfs_isiflocked(struct xfs_inode *ip)
 #define        XFS_IOLOCK_SHARED       (1<<1)
 #define        XFS_ILOCK_EXCL          (1<<2)
 #define        XFS_ILOCK_SHARED        (1<<3)
-#define        XFS_IUNLOCK_NONOTIFY    (1<<4)
 
 #define XFS_LOCK_MASK          (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED \
                                | XFS_ILOCK_EXCL | XFS_ILOCK_SHARED)
@@ -431,8 +443,7 @@ static inline int xfs_isiflocked(struct xfs_inode *ip)
        { XFS_IOLOCK_EXCL,      "IOLOCK_EXCL" }, \
        { XFS_IOLOCK_SHARED,    "IOLOCK_SHARED" }, \
        { XFS_ILOCK_EXCL,       "ILOCK_EXCL" }, \
-       { XFS_ILOCK_SHARED,     "ILOCK_SHARED" }, \
-       { XFS_IUNLOCK_NONOTIFY, "IUNLOCK_NONOTIFY" }
+       { XFS_ILOCK_SHARED,     "ILOCK_SHARED" }
 
 
 /*
@@ -522,10 +533,6 @@ void               xfs_promote_inode(struct xfs_inode *);
 void           xfs_lock_inodes(xfs_inode_t **, int, uint);
 void           xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint);
 
-void           xfs_synchronize_times(xfs_inode_t *);
-void           xfs_mark_inode_dirty(xfs_inode_t *);
-void           xfs_mark_inode_dirty_sync(xfs_inode_t *);
-
 #define IHOLD(ip) \
 do { \
        ASSERT(atomic_read(&VFS_I(ip)->i_count) > 0) ; \
index 91d71dcd4852eed6339bd1ceb54a8dbdf04cd27a..05d924efceafb68940a5dd072a719b42133d152a 100644 (file)
@@ -57,77 +57,28 @@ xfs_inode_item_size(
        struct xfs_inode        *ip = iip->ili_inode;
        uint                    nvecs = 2;
 
-       /*
-        * Only log the data/extents/b-tree root if there is something
-        * left to log.
-        */
-       iip->ili_format.ilf_fields |= XFS_ILOG_CORE;
-
        switch (ip->i_d.di_format) {
        case XFS_DINODE_FMT_EXTENTS:
-               iip->ili_format.ilf_fields &=
-                       ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT |
-                         XFS_ILOG_DEV | XFS_ILOG_UUID);
-               if ((iip->ili_format.ilf_fields & XFS_ILOG_DEXT) &&
-                   (ip->i_d.di_nextents > 0) &&
-                   (ip->i_df.if_bytes > 0)) {
-                       ASSERT(ip->i_df.if_u1.if_extents != NULL);
+               if ((iip->ili_fields & XFS_ILOG_DEXT) &&
+                   ip->i_d.di_nextents > 0 &&
+                   ip->i_df.if_bytes > 0)
                        nvecs++;
-               } else {
-                       iip->ili_format.ilf_fields &= ~XFS_ILOG_DEXT;
-               }
                break;
 
        case XFS_DINODE_FMT_BTREE:
-               iip->ili_format.ilf_fields &=
-                       ~(XFS_ILOG_DDATA | XFS_ILOG_DEXT |
-                         XFS_ILOG_DEV | XFS_ILOG_UUID);
-               if ((iip->ili_format.ilf_fields & XFS_ILOG_DBROOT) &&
-                   (ip->i_df.if_broot_bytes > 0)) {
-                       ASSERT(ip->i_df.if_broot != NULL);
+               if ((iip->ili_fields & XFS_ILOG_DBROOT) &&
+                   ip->i_df.if_broot_bytes > 0)
                        nvecs++;
-               } else {
-                       ASSERT(!(iip->ili_format.ilf_fields &
-                                XFS_ILOG_DBROOT));
-#ifdef XFS_TRANS_DEBUG
-                       if (iip->ili_root_size > 0) {
-                               ASSERT(iip->ili_root_size ==
-                                      ip->i_df.if_broot_bytes);
-                               ASSERT(memcmp(iip->ili_orig_root,
-                                           ip->i_df.if_broot,
-                                           iip->ili_root_size) == 0);
-                       } else {
-                               ASSERT(ip->i_df.if_broot_bytes == 0);
-                       }
-#endif
-                       iip->ili_format.ilf_fields &= ~XFS_ILOG_DBROOT;
-               }
                break;
 
        case XFS_DINODE_FMT_LOCAL:
-               iip->ili_format.ilf_fields &=
-                       ~(XFS_ILOG_DEXT | XFS_ILOG_DBROOT |
-                         XFS_ILOG_DEV | XFS_ILOG_UUID);
-               if ((iip->ili_format.ilf_fields & XFS_ILOG_DDATA) &&
-                   (ip->i_df.if_bytes > 0)) {
-                       ASSERT(ip->i_df.if_u1.if_data != NULL);
-                       ASSERT(ip->i_d.di_size > 0);
+               if ((iip->ili_fields & XFS_ILOG_DDATA) &&
+                   ip->i_df.if_bytes > 0)
                        nvecs++;
-               } else {
-                       iip->ili_format.ilf_fields &= ~XFS_ILOG_DDATA;
-               }
                break;
 
        case XFS_DINODE_FMT_DEV:
-               iip->ili_format.ilf_fields &=
-                       ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT |
-                         XFS_ILOG_DEXT | XFS_ILOG_UUID);
-               break;
-
        case XFS_DINODE_FMT_UUID:
-               iip->ili_format.ilf_fields &=
-                       ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT |
-                         XFS_ILOG_DEXT | XFS_ILOG_DEV);
                break;
 
        default:
@@ -135,56 +86,31 @@ xfs_inode_item_size(
                break;
        }
 
-       /*
-        * If there are no attributes associated with this file,
-        * then there cannot be anything more to log.
-        * Clear all attribute-related log flags.
-        */
-       if (!XFS_IFORK_Q(ip)) {
-               iip->ili_format.ilf_fields &=
-                       ~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT | XFS_ILOG_AEXT);
+       if (!XFS_IFORK_Q(ip))
                return nvecs;
-       }
+
 
        /*
         * Log any necessary attribute data.
         */
        switch (ip->i_d.di_aformat) {
        case XFS_DINODE_FMT_EXTENTS:
-               iip->ili_format.ilf_fields &=
-                       ~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT);
-               if ((iip->ili_format.ilf_fields & XFS_ILOG_AEXT) &&
-                   (ip->i_d.di_anextents > 0) &&
-                   (ip->i_afp->if_bytes > 0)) {
-                       ASSERT(ip->i_afp->if_u1.if_extents != NULL);
+               if ((iip->ili_fields & XFS_ILOG_AEXT) &&
+                   ip->i_d.di_anextents > 0 &&
+                   ip->i_afp->if_bytes > 0)
                        nvecs++;
-               } else {
-                       iip->ili_format.ilf_fields &= ~XFS_ILOG_AEXT;
-               }
                break;
 
        case XFS_DINODE_FMT_BTREE:
-               iip->ili_format.ilf_fields &=
-                       ~(XFS_ILOG_ADATA | XFS_ILOG_AEXT);
-               if ((iip->ili_format.ilf_fields & XFS_ILOG_ABROOT) &&
-                   (ip->i_afp->if_broot_bytes > 0)) {
-                       ASSERT(ip->i_afp->if_broot != NULL);
+               if ((iip->ili_fields & XFS_ILOG_ABROOT) &&
+                   ip->i_afp->if_broot_bytes > 0)
                        nvecs++;
-               } else {
-                       iip->ili_format.ilf_fields &= ~XFS_ILOG_ABROOT;
-               }
                break;
 
        case XFS_DINODE_FMT_LOCAL:
-               iip->ili_format.ilf_fields &=
-                       ~(XFS_ILOG_AEXT | XFS_ILOG_ABROOT);
-               if ((iip->ili_format.ilf_fields & XFS_ILOG_ADATA) &&
-                   (ip->i_afp->if_bytes > 0)) {
-                       ASSERT(ip->i_afp->if_u1.if_data != NULL);
+               if ((iip->ili_fields & XFS_ILOG_ADATA) &&
+                   ip->i_afp->if_bytes > 0)
                        nvecs++;
-               } else {
-                       iip->ili_format.ilf_fields &= ~XFS_ILOG_ADATA;
-               }
                break;
 
        default:
@@ -254,48 +180,11 @@ xfs_inode_item_format(
        vecp++;
        nvecs        = 1;
 
-       /*
-        * Clear i_update_core if the timestamps (or any other
-        * non-transactional modification) need flushing/logging
-        * and we're about to log them with the rest of the core.
-        *
-        * This is the same logic as xfs_iflush() but this code can't
-        * run at the same time as xfs_iflush because we're in commit
-        * processing here and so we have the inode lock held in
-        * exclusive mode.  Although it doesn't really matter
-        * for the timestamps if both routines were to grab the
-        * timestamps or not.  That would be ok.
-        *
-        * We clear i_update_core before copying out the data.
-        * This is for coordination with our timestamp updates
-        * that don't hold the inode lock. They will always
-        * update the timestamps BEFORE setting i_update_core,
-        * so if we clear i_update_core after they set it we
-        * are guaranteed to see their updates to the timestamps
-        * either here.  Likewise, if they set it after we clear it
-        * here, we'll see it either on the next commit of this
-        * inode or the next time the inode gets flushed via
-        * xfs_iflush().  This depends on strongly ordered memory
-        * semantics, but we have that.  We use the SYNCHRONIZE
-        * macro to make sure that the compiler does not reorder
-        * the i_update_core access below the data copy below.
-        */
-       if (ip->i_update_core)  {
-               ip->i_update_core = 0;
-               SYNCHRONIZE();
-       }
-
-       /*
-        * Make sure to get the latest timestamps from the Linux inode.
-        */
-       xfs_synchronize_times(ip);
-
        vecp->i_addr = &ip->i_d;
        vecp->i_len  = sizeof(struct xfs_icdinode);
        vecp->i_type = XLOG_REG_TYPE_ICORE;
        vecp++;
        nvecs++;
-       iip->ili_format.ilf_fields |= XFS_ILOG_CORE;
 
        /*
         * If this is really an old format inode, then we need to
@@ -328,16 +217,17 @@ xfs_inode_item_format(
 
        switch (ip->i_d.di_format) {
        case XFS_DINODE_FMT_EXTENTS:
-               ASSERT(!(iip->ili_format.ilf_fields &
-                        (XFS_ILOG_DDATA | XFS_ILOG_DBROOT |
-                         XFS_ILOG_DEV | XFS_ILOG_UUID)));
-               if (iip->ili_format.ilf_fields & XFS_ILOG_DEXT) {
-                       ASSERT(ip->i_df.if_bytes > 0);
+               iip->ili_fields &=
+                       ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT |
+                         XFS_ILOG_DEV | XFS_ILOG_UUID);
+
+               if ((iip->ili_fields & XFS_ILOG_DEXT) &&
+                   ip->i_d.di_nextents > 0 &&
+                   ip->i_df.if_bytes > 0) {
                        ASSERT(ip->i_df.if_u1.if_extents != NULL);
-                       ASSERT(ip->i_d.di_nextents > 0);
+                       ASSERT(ip->i_df.if_bytes / sizeof(xfs_bmbt_rec_t) > 0);
                        ASSERT(iip->ili_extents_buf == NULL);
-                       ASSERT((ip->i_df.if_bytes /
-                               (uint)sizeof(xfs_bmbt_rec_t)) > 0);
+
 #ifdef XFS_NATIVE_HOST
                        if (ip->i_d.di_nextents == ip->i_df.if_bytes /
                                                (uint)sizeof(xfs_bmbt_rec_t)) {
@@ -359,15 +249,18 @@ xfs_inode_item_format(
                        iip->ili_format.ilf_dsize = vecp->i_len;
                        vecp++;
                        nvecs++;
+               } else {
+                       iip->ili_fields &= ~XFS_ILOG_DEXT;
                }
                break;
 
        case XFS_DINODE_FMT_BTREE:
-               ASSERT(!(iip->ili_format.ilf_fields &
-                        (XFS_ILOG_DDATA | XFS_ILOG_DEXT |
-                         XFS_ILOG_DEV | XFS_ILOG_UUID)));
-               if (iip->ili_format.ilf_fields & XFS_ILOG_DBROOT) {
-                       ASSERT(ip->i_df.if_broot_bytes > 0);
+               iip->ili_fields &=
+                       ~(XFS_ILOG_DDATA | XFS_ILOG_DEXT |
+                         XFS_ILOG_DEV | XFS_ILOG_UUID);
+
+               if ((iip->ili_fields & XFS_ILOG_DBROOT) &&
+                   ip->i_df.if_broot_bytes > 0) {
                        ASSERT(ip->i_df.if_broot != NULL);
                        vecp->i_addr = ip->i_df.if_broot;
                        vecp->i_len = ip->i_df.if_broot_bytes;
@@ -375,15 +268,30 @@ xfs_inode_item_format(
                        vecp++;
                        nvecs++;
                        iip->ili_format.ilf_dsize = ip->i_df.if_broot_bytes;
+               } else {
+                       ASSERT(!(iip->ili_fields &
+                                XFS_ILOG_DBROOT));
+#ifdef XFS_TRANS_DEBUG
+                       if (iip->ili_root_size > 0) {
+                               ASSERT(iip->ili_root_size ==
+                                      ip->i_df.if_broot_bytes);
+                               ASSERT(memcmp(iip->ili_orig_root,
+                                           ip->i_df.if_broot,
+                                           iip->ili_root_size) == 0);
+                       } else {
+                               ASSERT(ip->i_df.if_broot_bytes == 0);
+                       }
+#endif
+                       iip->ili_fields &= ~XFS_ILOG_DBROOT;
                }
                break;
 
        case XFS_DINODE_FMT_LOCAL:
-               ASSERT(!(iip->ili_format.ilf_fields &
-                        (XFS_ILOG_DBROOT | XFS_ILOG_DEXT |
-                         XFS_ILOG_DEV | XFS_ILOG_UUID)));
-               if (iip->ili_format.ilf_fields & XFS_ILOG_DDATA) {
-                       ASSERT(ip->i_df.if_bytes > 0);
+               iip->ili_fields &=
+                       ~(XFS_ILOG_DEXT | XFS_ILOG_DBROOT |
+                         XFS_ILOG_DEV | XFS_ILOG_UUID);
+               if ((iip->ili_fields & XFS_ILOG_DDATA) &&
+                   ip->i_df.if_bytes > 0) {
                        ASSERT(ip->i_df.if_u1.if_data != NULL);
                        ASSERT(ip->i_d.di_size > 0);
 
@@ -401,24 +309,26 @@ xfs_inode_item_format(
                        vecp++;
                        nvecs++;
                        iip->ili_format.ilf_dsize = (unsigned)data_bytes;
+               } else {
+                       iip->ili_fields &= ~XFS_ILOG_DDATA;
                }
                break;
 
        case XFS_DINODE_FMT_DEV:
-               ASSERT(!(iip->ili_format.ilf_fields &
-                        (XFS_ILOG_DBROOT | XFS_ILOG_DEXT |
-                         XFS_ILOG_DDATA | XFS_ILOG_UUID)));
-               if (iip->ili_format.ilf_fields & XFS_ILOG_DEV) {
+               iip->ili_fields &=
+                       ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT |
+                         XFS_ILOG_DEXT | XFS_ILOG_UUID);
+               if (iip->ili_fields & XFS_ILOG_DEV) {
                        iip->ili_format.ilf_u.ilfu_rdev =
                                ip->i_df.if_u2.if_rdev;
                }
                break;
 
        case XFS_DINODE_FMT_UUID:
-               ASSERT(!(iip->ili_format.ilf_fields &
-                        (XFS_ILOG_DBROOT | XFS_ILOG_DEXT |
-                         XFS_ILOG_DDATA | XFS_ILOG_DEV)));
-               if (iip->ili_format.ilf_fields & XFS_ILOG_UUID) {
+               iip->ili_fields &=
+                       ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT |
+                         XFS_ILOG_DEXT | XFS_ILOG_DEV);
+               if (iip->ili_fields & XFS_ILOG_UUID) {
                        iip->ili_format.ilf_u.ilfu_uuid =
                                ip->i_df.if_u2.if_uuid;
                }
@@ -430,31 +340,25 @@ xfs_inode_item_format(
        }
 
        /*
-        * If there are no attributes associated with the file,
-        * then we're done.
-        * Assert that no attribute-related log flags are set.
+        * If there are no attributes associated with the file, then we're done.
         */
        if (!XFS_IFORK_Q(ip)) {
-               iip->ili_format.ilf_size = nvecs;
-               ASSERT(!(iip->ili_format.ilf_fields &
-                        (XFS_ILOG_ADATA | XFS_ILOG_ABROOT | XFS_ILOG_AEXT)));
-               return;
+               iip->ili_fields &=
+                       ~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT | XFS_ILOG_AEXT);
+               goto out;
        }
 
        switch (ip->i_d.di_aformat) {
        case XFS_DINODE_FMT_EXTENTS:
-               ASSERT(!(iip->ili_format.ilf_fields &
-                        (XFS_ILOG_ADATA | XFS_ILOG_ABROOT)));
-               if (iip->ili_format.ilf_fields & XFS_ILOG_AEXT) {
-#ifdef DEBUG
-                       int nrecs = ip->i_afp->if_bytes /
-                               (uint)sizeof(xfs_bmbt_rec_t);
-                       ASSERT(nrecs > 0);
-                       ASSERT(nrecs == ip->i_d.di_anextents);
-                       ASSERT(ip->i_afp->if_bytes > 0);
+               iip->ili_fields &=
+                       ~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT);
+
+               if ((iip->ili_fields & XFS_ILOG_AEXT) &&
+                   ip->i_d.di_anextents > 0 &&
+                   ip->i_afp->if_bytes > 0) {
+                       ASSERT(ip->i_afp->if_bytes / sizeof(xfs_bmbt_rec_t) ==
+                               ip->i_d.di_anextents);
                        ASSERT(ip->i_afp->if_u1.if_extents != NULL);
-                       ASSERT(ip->i_d.di_anextents > 0);
-#endif
 #ifdef XFS_NATIVE_HOST
                        /*
                         * There are not delayed allocation extents
@@ -471,29 +375,36 @@ xfs_inode_item_format(
                        iip->ili_format.ilf_asize = vecp->i_len;
                        vecp++;
                        nvecs++;
+               } else {
+                       iip->ili_fields &= ~XFS_ILOG_AEXT;
                }
                break;
 
        case XFS_DINODE_FMT_BTREE:
-               ASSERT(!(iip->ili_format.ilf_fields &
-                        (XFS_ILOG_ADATA | XFS_ILOG_AEXT)));
-               if (iip->ili_format.ilf_fields & XFS_ILOG_ABROOT) {
-                       ASSERT(ip->i_afp->if_broot_bytes > 0);
+               iip->ili_fields &=
+                       ~(XFS_ILOG_ADATA | XFS_ILOG_AEXT);
+
+               if ((iip->ili_fields & XFS_ILOG_ABROOT) &&
+                   ip->i_afp->if_broot_bytes > 0) {
                        ASSERT(ip->i_afp->if_broot != NULL);
+
                        vecp->i_addr = ip->i_afp->if_broot;
                        vecp->i_len = ip->i_afp->if_broot_bytes;
                        vecp->i_type = XLOG_REG_TYPE_IATTR_BROOT;
                        vecp++;
                        nvecs++;
                        iip->ili_format.ilf_asize = ip->i_afp->if_broot_bytes;
+               } else {
+                       iip->ili_fields &= ~XFS_ILOG_ABROOT;
                }
                break;
 
        case XFS_DINODE_FMT_LOCAL:
-               ASSERT(!(iip->ili_format.ilf_fields &
-                        (XFS_ILOG_ABROOT | XFS_ILOG_AEXT)));
-               if (iip->ili_format.ilf_fields & XFS_ILOG_ADATA) {
-                       ASSERT(ip->i_afp->if_bytes > 0);
+               iip->ili_fields &=
+                       ~(XFS_ILOG_AEXT | XFS_ILOG_ABROOT);
+
+               if ((iip->ili_fields & XFS_ILOG_ADATA) &&
+                   ip->i_afp->if_bytes > 0) {
                        ASSERT(ip->i_afp->if_u1.if_data != NULL);
 
                        vecp->i_addr = ip->i_afp->if_u1.if_data;
@@ -510,6 +421,8 @@ xfs_inode_item_format(
                        vecp++;
                        nvecs++;
                        iip->ili_format.ilf_asize = (unsigned)data_bytes;
+               } else {
+                       iip->ili_fields &= ~XFS_ILOG_ADATA;
                }
                break;
 
@@ -518,6 +431,15 @@ xfs_inode_item_format(
                break;
        }
 
+out:
+       /*
+        * Now update the log format that goes out to disk from the in-core
+        * values.  We always write the inode core to make the arithmetic
+        * games in recovery easier, which isn't a big deal as just about any
+        * transaction would dirty it anyway.
+        */
+       iip->ili_format.ilf_fields = XFS_ILOG_CORE |
+               (iip->ili_fields & ~XFS_ILOG_TIMESTAMP);
        iip->ili_format.ilf_size = nvecs;
 }
 
@@ -596,17 +518,13 @@ xfs_inode_item_trylock(
        /* Stale items should force out the iclog */
        if (ip->i_flags & XFS_ISTALE) {
                xfs_ifunlock(ip);
-               /*
-                * we hold the AIL lock - notify the unlock routine of this
-                * so it doesn't try to get the lock again.
-                */
-               xfs_iunlock(ip, XFS_ILOCK_SHARED|XFS_IUNLOCK_NONOTIFY);
+               xfs_iunlock(ip, XFS_ILOCK_SHARED);
                return XFS_ITEM_PINNED;
        }
 
 #ifdef DEBUG
        if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
-               ASSERT(iip->ili_format.ilf_fields != 0);
+               ASSERT(iip->ili_fields != 0);
                ASSERT(iip->ili_logged == 0);
                ASSERT(lip->li_flags & XFS_LI_IN_AIL);
        }
@@ -638,7 +556,7 @@ xfs_inode_item_unlock(
        if (iip->ili_extents_buf != NULL) {
                ASSERT(ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS);
                ASSERT(ip->i_d.di_nextents > 0);
-               ASSERT(iip->ili_format.ilf_fields & XFS_ILOG_DEXT);
+               ASSERT(iip->ili_fields & XFS_ILOG_DEXT);
                ASSERT(ip->i_df.if_bytes > 0);
                kmem_free(iip->ili_extents_buf);
                iip->ili_extents_buf = NULL;
@@ -646,7 +564,7 @@ xfs_inode_item_unlock(
        if (iip->ili_aextents_buf != NULL) {
                ASSERT(ip->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS);
                ASSERT(ip->i_d.di_anextents > 0);
-               ASSERT(iip->ili_format.ilf_fields & XFS_ILOG_AEXT);
+               ASSERT(iip->ili_fields & XFS_ILOG_AEXT);
                ASSERT(ip->i_afp->if_bytes > 0);
                kmem_free(iip->ili_aextents_buf);
                iip->ili_aextents_buf = NULL;
@@ -761,8 +679,7 @@ xfs_inode_item_push(
         * lock without sleeping, then there must not have been
         * anyone in the process of flushing the inode.
         */
-       ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) ||
-              iip->ili_format.ilf_fields != 0);
+       ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || iip->ili_fields != 0);
 
        /*
         * Push the inode to it's backing buffer. This will not remove the
@@ -985,7 +902,7 @@ xfs_iflush_abort(
                 * Clear the inode logging fields so no more flushes are
                 * attempted.
                 */
-               iip->ili_format.ilf_fields = 0;
+               iip->ili_fields = 0;
        }
        /*
         * Release the inode's flush lock since we're done with it.
index d3dee61e6d91fde1671157b3ee2deeee8df49fcd..41d61c3b7a36e249028d34c505e941636f5c9e68 100644 (file)
@@ -86,6 +86,15 @@ typedef struct xfs_inode_log_format_64 {
 #define        XFS_ILOG_AEXT   0x080   /* log i_af.if_extents */
 #define        XFS_ILOG_ABROOT 0x100   /* log i_af.i_broot */
 
+
+/*
+ * The timestamps are dirty, but not necessarily anything else in the inode
+ * core.  Unlike the other fields above this one must never make it to disk
+ * in the ilf_fields of the inode_log_format, but is purely stored in-memory in
+ * ili_fields in the inode_log_item.
+ */
+#define XFS_ILOG_TIMESTAMP     0x4000
+
 #define        XFS_ILOG_NONCORE        (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \
                                 XFS_ILOG_DBROOT | XFS_ILOG_DEV | \
                                 XFS_ILOG_UUID | XFS_ILOG_ADATA | \
@@ -101,7 +110,7 @@ typedef struct xfs_inode_log_format_64 {
                                 XFS_ILOG_DEXT | XFS_ILOG_DBROOT | \
                                 XFS_ILOG_DEV | XFS_ILOG_UUID | \
                                 XFS_ILOG_ADATA | XFS_ILOG_AEXT | \
-                                XFS_ILOG_ABROOT)
+                                XFS_ILOG_ABROOT | XFS_ILOG_TIMESTAMP)
 
 static inline int xfs_ilog_fbroot(int w)
 {
@@ -134,6 +143,7 @@ typedef struct xfs_inode_log_item {
        unsigned short          ili_lock_flags;    /* lock flags */
        unsigned short          ili_logged;        /* flushed logged data */
        unsigned int            ili_last_fields;   /* fields when flushed */
+       unsigned int            ili_fields;        /* fields to be logged */
        struct xfs_bmbt_rec     *ili_extents_buf;  /* array of logged
                                                      data exts */
        struct xfs_bmbt_rec     *ili_aextents_buf; /* array of logged
@@ -148,9 +158,7 @@ typedef struct xfs_inode_log_item {
 
 static inline int xfs_inode_clean(xfs_inode_t *ip)
 {
-       return (!ip->i_itemp ||
-               !(ip->i_itemp->ili_format.ilf_fields & XFS_ILOG_ALL)) &&
-              !ip->i_update_core;
+       return !ip->i_itemp || !(ip->i_itemp->ili_fields & XFS_ILOG_ALL);
 }
 
 extern void xfs_inode_item_init(struct xfs_inode *, struct xfs_mount *);
index 76f3ca5cfc361f962fa40a72243a525f191a618f..f588320dc4b9070a6d2c65244a9a7ed9ac1cc462 100644 (file)
@@ -450,9 +450,12 @@ xfs_attrmulti_attr_get(
 
        if (*len > XATTR_SIZE_MAX)
                return EINVAL;
-       kbuf = kmalloc(*len, GFP_KERNEL);
-       if (!kbuf)
-               return ENOMEM;
+       kbuf = kmem_zalloc(*len, KM_SLEEP | KM_MAYFAIL);
+       if (!kbuf) {
+               kbuf = kmem_zalloc_large(*len);
+               if (!kbuf)
+                       return ENOMEM;
+       }
 
        error = xfs_attr_get(XFS_I(inode), name, kbuf, (int *)len, flags);
        if (error)
@@ -462,7 +465,10 @@ xfs_attrmulti_attr_get(
                error = EFAULT;
 
  out_kfree:
-       kfree(kbuf);
+       if (is_vmalloc_addr(kbuf))
+               kmem_free_large(kbuf);
+       else
+               kmem_free(kbuf);
        return error;
 }
 
index f9ccb7b7c043bc0d9524fae8f3afa59975bf32ce..a849a5473aff41ec61dcb0770cc3251fab318206 100644 (file)
@@ -293,7 +293,7 @@ xfs_compat_ioc_bulkstat(
                int res;
 
                error = xfs_bulkstat_one_compat(mp, inlast, bulkreq.ubuffer,
-                               sizeof(compat_xfs_bstat_t), 0, &res);
+                               sizeof(compat_xfs_bstat_t), NULL, &res);
        } else if (cmd == XFS_IOC_FSBULKSTAT_32) {
                error = xfs_bulkstat(mp, &inlast, &count,
                        xfs_bulkstat_one_compat, sizeof(compat_xfs_bstat_t),
index 246c7d57c6f96c876778128e8d21c90fca692ce9..71a464503c43837c181b7fb163aebdd612c0d18d 100644 (file)
@@ -31,6 +31,7 @@
 #include "xfs_ialloc_btree.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
+#include "xfs_inode_item.h"
 #include "xfs_btree.h"
 #include "xfs_bmap.h"
 #include "xfs_rtalloc.h"
@@ -645,6 +646,7 @@ xfs_iomap_write_unwritten(
        xfs_trans_t     *tp;
        xfs_bmbt_irec_t imap;
        xfs_bmap_free_t free_list;
+       xfs_fsize_t     i_size;
        uint            resblks;
        int             committed;
        int             error;
@@ -705,7 +707,22 @@ xfs_iomap_write_unwritten(
                if (error)
                        goto error_on_bmapi_transaction;
 
-               error = xfs_bmap_finish(&(tp), &(free_list), &committed);
+               /*
+                * Log the updated inode size as we go.  We have to be careful
+                * to only log it up to the actual write offset if it is
+                * halfway into a block.
+                */
+               i_size = XFS_FSB_TO_B(mp, offset_fsb + count_fsb);
+               if (i_size > offset + count)
+                       i_size = offset + count;
+
+               i_size = xfs_new_eof(ip, i_size);
+               if (i_size) {
+                       ip->i_d.di_size = i_size;
+                       xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+               }
+
+               error = xfs_bmap_finish(&tp, &free_list, &committed);
                if (error)
                        goto error_on_bmapi_transaction;
 
index ab302539e5b9603b8a67bb9f4399c03625fe1fd7..3011b879f850c2a91b94aafa011bf4006d9ab41e 100644 (file)
 #include <linux/fiemap.h>
 #include <linux/slab.h>
 
-/*
- * Bring the timestamps in the XFS inode uptodate.
- *
- * Used before writing the inode to disk.
- */
-void
-xfs_synchronize_times(
-       xfs_inode_t     *ip)
-{
-       struct inode    *inode = VFS_I(ip);
-
-       ip->i_d.di_atime.t_sec = (__int32_t)inode->i_atime.tv_sec;
-       ip->i_d.di_atime.t_nsec = (__int32_t)inode->i_atime.tv_nsec;
-       ip->i_d.di_ctime.t_sec = (__int32_t)inode->i_ctime.tv_sec;
-       ip->i_d.di_ctime.t_nsec = (__int32_t)inode->i_ctime.tv_nsec;
-       ip->i_d.di_mtime.t_sec = (__int32_t)inode->i_mtime.tv_sec;
-       ip->i_d.di_mtime.t_nsec = (__int32_t)inode->i_mtime.tv_nsec;
-}
-
-/*
- * If the linux inode is valid, mark it dirty, else mark the dirty state
- * in the XFS inode to make sure we pick it up when reclaiming the inode.
- */
-void
-xfs_mark_inode_dirty_sync(
-       xfs_inode_t     *ip)
-{
-       struct inode    *inode = VFS_I(ip);
-
-       if (!(inode->i_state & (I_WILL_FREE|I_FREEING)))
-               mark_inode_dirty_sync(inode);
-       else {
-               barrier();
-               ip->i_update_core = 1;
-       }
-}
-
-void
-xfs_mark_inode_dirty(
-       xfs_inode_t     *ip)
-{
-       struct inode    *inode = VFS_I(ip);
-
-       if (!(inode->i_state & (I_WILL_FREE|I_FREEING)))
-               mark_inode_dirty(inode);
-       else {
-               barrier();
-               ip->i_update_core = 1;
-       }
-
-}
-
-
-int xfs_initxattrs(struct inode *inode, const struct xattr *xattr_array,
-                  void *fs_info)
+static int
+xfs_initxattrs(
+       struct inode            *inode,
+       const struct xattr      *xattr_array,
+       void                    *fs_info)
 {
-       const struct xattr *xattr;
-       struct xfs_inode *ip = XFS_I(inode);
-       int error = 0;
+       const struct xattr      *xattr;
+       struct xfs_inode        *ip = XFS_I(inode);
+       int                     error = 0;
 
        for (xattr = xattr_array; xattr->name != NULL; xattr++) {
                error = xfs_attr_set(ip, xattr->name, xattr->value,
@@ -678,19 +628,16 @@ xfs_setattr_nonsize(
                inode->i_atime = iattr->ia_atime;
                ip->i_d.di_atime.t_sec = iattr->ia_atime.tv_sec;
                ip->i_d.di_atime.t_nsec = iattr->ia_atime.tv_nsec;
-               ip->i_update_core = 1;
        }
        if (mask & ATTR_CTIME) {
                inode->i_ctime = iattr->ia_ctime;
                ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec;
                ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec;
-               ip->i_update_core = 1;
        }
        if (mask & ATTR_MTIME) {
                inode->i_mtime = iattr->ia_mtime;
                ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec;
                ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec;
-               ip->i_update_core = 1;
        }
 
        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
@@ -918,13 +865,11 @@ xfs_setattr_size(
                inode->i_ctime = iattr->ia_ctime;
                ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec;
                ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec;
-               ip->i_update_core = 1;
        }
        if (mask & ATTR_MTIME) {
                inode->i_mtime = iattr->ia_mtime;
                ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec;
                ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec;
-               ip->i_update_core = 1;
        }
 
        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
index 751e94fe1f77e2694790da48952cf03f30a554e4..9720c54bbed0dd4a37e9ebc6bbb33bf9ee8fdbf7 100644 (file)
@@ -62,7 +62,6 @@ xfs_bulkstat_one_int(
 {
        struct xfs_icdinode     *dic;           /* dinode core info pointer */
        struct xfs_inode        *ip;            /* incore inode pointer */
-       struct inode            *inode;
        struct xfs_bstat        *buf;           /* return buffer */
        int                     error = 0;      /* error value */
 
@@ -86,7 +85,6 @@ xfs_bulkstat_one_int(
        ASSERT(ip->i_imap.im_blkno != 0);
 
        dic = &ip->i_d;
-       inode = VFS_I(ip);
 
        /* xfs_iget returns the following without needing
         * further change.
@@ -99,19 +97,12 @@ xfs_bulkstat_one_int(
        buf->bs_uid = dic->di_uid;
        buf->bs_gid = dic->di_gid;
        buf->bs_size = dic->di_size;
-
-       /*
-        * We need to read the timestamps from the Linux inode because
-        * the VFS keeps writing directly into the inode structure instead
-        * of telling us about the updates.
-        */
-       buf->bs_atime.tv_sec = inode->i_atime.tv_sec;
-       buf->bs_atime.tv_nsec = inode->i_atime.tv_nsec;
-       buf->bs_mtime.tv_sec = inode->i_mtime.tv_sec;
-       buf->bs_mtime.tv_nsec = inode->i_mtime.tv_nsec;
-       buf->bs_ctime.tv_sec = inode->i_ctime.tv_sec;
-       buf->bs_ctime.tv_nsec = inode->i_ctime.tv_nsec;
-
+       buf->bs_atime.tv_sec = dic->di_atime.t_sec;
+       buf->bs_atime.tv_nsec = dic->di_atime.t_nsec;
+       buf->bs_mtime.tv_sec = dic->di_mtime.t_sec;
+       buf->bs_mtime.tv_nsec = dic->di_mtime.t_nsec;
+       buf->bs_ctime.tv_sec = dic->di_ctime.t_sec;
+       buf->bs_ctime.tv_nsec = dic->di_ctime.t_nsec;
        buf->bs_xflags = xfs_ip2xflags(ip);
        buf->bs_extsize = dic->di_extsize << mp->m_sb.sb_blocklog;
        buf->bs_extents = dic->di_nextents;
index e2cc3568c2998ccc45be81b5381c1bf9f91fcf9f..98a9cb5ffd1700995e8118e71524ba6d12e12fa2 100644 (file)
@@ -67,15 +67,10 @@ STATIC void xlog_state_switch_iclogs(xlog_t         *log,
                                     int                eventual_size);
 STATIC void xlog_state_want_sync(xlog_t        *log, xlog_in_core_t *iclog);
 
-/* local functions to manipulate grant head */
-STATIC int  xlog_grant_log_space(xlog_t                *log,
-                                xlog_ticket_t  *xtic);
 STATIC void xlog_grant_push_ail(struct log     *log,
                                int             need_bytes);
 STATIC void xlog_regrant_reserve_log_space(xlog_t       *log,
                                           xlog_ticket_t *ticket);
-STATIC int xlog_regrant_write_log_space(xlog_t         *log,
-                                        xlog_ticket_t  *ticket);
 STATIC void xlog_ungrant_log_space(xlog_t       *log,
                                   xlog_ticket_t *ticket);
 
@@ -150,78 +145,93 @@ xlog_grant_add_space(
        } while (head_val != old);
 }
 
-STATIC bool
-xlog_reserveq_wake(
-       struct log              *log,
-       int                     *free_bytes)
+STATIC void
+xlog_grant_head_init(
+       struct xlog_grant_head  *head)
+{
+       xlog_assign_grant_head(&head->grant, 1, 0);
+       INIT_LIST_HEAD(&head->waiters);
+       spin_lock_init(&head->lock);
+}
+
+STATIC void
+xlog_grant_head_wake_all(
+       struct xlog_grant_head  *head)
 {
        struct xlog_ticket      *tic;
-       int                     need_bytes;
 
-       list_for_each_entry(tic, &log->l_reserveq, t_queue) {
+       spin_lock(&head->lock);
+       list_for_each_entry(tic, &head->waiters, t_queue)
+               wake_up_process(tic->t_task);
+       spin_unlock(&head->lock);
+}
+
+static inline int
+xlog_ticket_reservation(
+       struct log              *log,
+       struct xlog_grant_head  *head,
+       struct xlog_ticket      *tic)
+{
+       if (head == &log->l_write_head) {
+               ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV);
+               return tic->t_unit_res;
+       } else {
                if (tic->t_flags & XLOG_TIC_PERM_RESERV)
-                       need_bytes = tic->t_unit_res * tic->t_cnt;
+                       return tic->t_unit_res * tic->t_cnt;
                else
-                       need_bytes = tic->t_unit_res;
-
-               if (*free_bytes < need_bytes)
-                       return false;
-               *free_bytes -= need_bytes;
-
-               trace_xfs_log_grant_wake_up(log, tic);
-               wake_up(&tic->t_wait);
+                       return tic->t_unit_res;
        }
-
-       return true;
 }
 
 STATIC bool
-xlog_writeq_wake(
+xlog_grant_head_wake(
        struct log              *log,
+       struct xlog_grant_head  *head,
        int                     *free_bytes)
 {
        struct xlog_ticket      *tic;
        int                     need_bytes;
 
-       list_for_each_entry(tic, &log->l_writeq, t_queue) {
-               ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV);
-
-               need_bytes = tic->t_unit_res;
-
+       list_for_each_entry(tic, &head->waiters, t_queue) {
+               need_bytes = xlog_ticket_reservation(log, head, tic);
                if (*free_bytes < need_bytes)
                        return false;
-               *free_bytes -= need_bytes;
 
-               trace_xfs_log_regrant_write_wake_up(log, tic);
-               wake_up(&tic->t_wait);
+               *free_bytes -= need_bytes;
+               trace_xfs_log_grant_wake_up(log, tic);
+               wake_up_process(tic->t_task);
        }
 
        return true;
 }
 
 STATIC int
-xlog_reserveq_wait(
+xlog_grant_head_wait(
        struct log              *log,
+       struct xlog_grant_head  *head,
        struct xlog_ticket      *tic,
        int                     need_bytes)
 {
-       list_add_tail(&tic->t_queue, &log->l_reserveq);
+       list_add_tail(&tic->t_queue, &head->waiters);
 
        do {
                if (XLOG_FORCED_SHUTDOWN(log))
                        goto shutdown;
                xlog_grant_push_ail(log, need_bytes);
 
+               __set_current_state(TASK_UNINTERRUPTIBLE);
+               spin_unlock(&head->lock);
+
                XFS_STATS_INC(xs_sleep_logspace);
-               trace_xfs_log_grant_sleep(log, tic);
 
-               xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock);
+               trace_xfs_log_grant_sleep(log, tic);
+               schedule();
                trace_xfs_log_grant_wake(log, tic);
 
-               spin_lock(&log->l_grant_reserve_lock);
+               spin_lock(&head->lock);
                if (XLOG_FORCED_SHUTDOWN(log))
                        goto shutdown;
-       } while (xlog_space_left(log, &log->l_grant_reserve_head) < need_bytes);
+       } while (xlog_space_left(log, &head->grant) < need_bytes);
 
        list_del_init(&tic->t_queue);
        return 0;
@@ -230,35 +240,58 @@ shutdown:
        return XFS_ERROR(EIO);
 }
 
+/*
+ * Atomically get the log space required for a log ticket.
+ *
+ * Once a ticket gets put onto head->waiters, it will only return after the
+ * needed reservation is satisfied.
+ *
+ * This function is structured so that it has a lock free fast path. This is
+ * necessary because every new transaction reservation will come through this
+ * path. Hence any lock will be globally hot if we take it unconditionally on
+ * every pass.
+ *
+ * As tickets are only ever moved on and off head->waiters under head->lock, we
+ * only need to take that lock if we are going to add the ticket to the queue
+ * and sleep. We can avoid taking the lock if the ticket was never added to
+ * head->waiters because the t_queue list head will be empty and we hold the
+ * only reference to it so it can safely be checked unlocked.
+ */
 STATIC int
-xlog_writeq_wait(
+xlog_grant_head_check(
        struct log              *log,
+       struct xlog_grant_head  *head,
        struct xlog_ticket      *tic,
-       int                     need_bytes)
+       int                     *need_bytes)
 {
-       list_add_tail(&tic->t_queue, &log->l_writeq);
-
-       do {
-               if (XLOG_FORCED_SHUTDOWN(log))
-                       goto shutdown;
-               xlog_grant_push_ail(log, need_bytes);
-
-               XFS_STATS_INC(xs_sleep_logspace);
-               trace_xfs_log_regrant_write_sleep(log, tic);
+       int                     free_bytes;
+       int                     error = 0;
 
-               xlog_wait(&tic->t_wait, &log->l_grant_write_lock);
-               trace_xfs_log_regrant_write_wake(log, tic);
+       ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY));
 
-               spin_lock(&log->l_grant_write_lock);
-               if (XLOG_FORCED_SHUTDOWN(log))
-                       goto shutdown;
-       } while (xlog_space_left(log, &log->l_grant_write_head) < need_bytes);
+       /*
+        * If there are other waiters on the queue then give them a chance at
+        * logspace before us.  Wake up the first waiters; if we do not wake
+        * up all the waiters then go to sleep waiting for more free space,
+        * otherwise try to get some space for this transaction.
+        */
+       *need_bytes = xlog_ticket_reservation(log, head, tic);
+       free_bytes = xlog_space_left(log, &head->grant);
+       if (!list_empty_careful(&head->waiters)) {
+               spin_lock(&head->lock);
+               if (!xlog_grant_head_wake(log, head, &free_bytes) ||
+                   free_bytes < *need_bytes) {
+                       error = xlog_grant_head_wait(log, head, tic,
+                                                    *need_bytes);
+               }
+               spin_unlock(&head->lock);
+       } else if (free_bytes < *need_bytes) {
+               spin_lock(&head->lock);
+               error = xlog_grant_head_wait(log, head, tic, *need_bytes);
+               spin_unlock(&head->lock);
+       }
 
-       list_del_init(&tic->t_queue);
-       return 0;
-shutdown:
-       list_del_init(&tic->t_queue);
-       return XFS_ERROR(EIO);
+       return error;
 }
 
 static void
@@ -285,6 +318,128 @@ xlog_tic_add_region(xlog_ticket_t *tic, uint len, uint type)
        tic->t_res_num++;
 }
 
+/*
+ * Replenish the byte reservation required by moving the grant write head.
+ */
+int
+xfs_log_regrant(
+       struct xfs_mount        *mp,
+       struct xlog_ticket      *tic)
+{
+       struct log              *log = mp->m_log;
+       int                     need_bytes;
+       int                     error = 0;
+
+       if (XLOG_FORCED_SHUTDOWN(log))
+               return XFS_ERROR(EIO);
+
+       XFS_STATS_INC(xs_try_logspace);
+
+       /*
+        * This is a new transaction on the ticket, so we need to change the
+        * transaction ID so that the next transaction has a different TID in
+        * the log. Just add one to the existing tid so that we can see chains
+        * of rolling transactions in the log easily.
+        */
+       tic->t_tid++;
+
+       xlog_grant_push_ail(log, tic->t_unit_res);
+
+       tic->t_curr_res = tic->t_unit_res;
+       xlog_tic_reset_res(tic);
+
+       if (tic->t_cnt > 0)
+               return 0;
+
+       trace_xfs_log_regrant(log, tic);
+
+       error = xlog_grant_head_check(log, &log->l_write_head, tic,
+                                     &need_bytes);
+       if (error)
+               goto out_error;
+
+       xlog_grant_add_space(log, &log->l_write_head.grant, need_bytes);
+       trace_xfs_log_regrant_exit(log, tic);
+       xlog_verify_grant_tail(log);
+       return 0;
+
+out_error:
+       /*
+        * If we are failing, make sure the ticket doesn't have any current
+        * reservations.  We don't want to add this back when the ticket/
+        * transaction gets cancelled.
+        */
+       tic->t_curr_res = 0;
+       tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */
+       return error;
+}
+
+/*
+ * Reserve log space and return a ticket corresponding to the reservation.
+ *
+ * Each reservation is going to reserve extra space for a log record header.
+ * When writes happen to the on-disk log, we don't subtract the length of the
+ * log record header from any reservation.  By wasting space in each
+ * reservation, we prevent over allocation problems.
+ */
+int
+xfs_log_reserve(
+       struct xfs_mount        *mp,
+       int                     unit_bytes,
+       int                     cnt,
+       struct xlog_ticket      **ticp,
+       __uint8_t               client,
+       bool                    permanent,
+       uint                    t_type)
+{
+       struct log              *log = mp->m_log;
+       struct xlog_ticket      *tic;
+       int                     need_bytes;
+       int                     error = 0;
+
+       ASSERT(client == XFS_TRANSACTION || client == XFS_LOG);
+
+       if (XLOG_FORCED_SHUTDOWN(log))
+               return XFS_ERROR(EIO);
+
+       XFS_STATS_INC(xs_try_logspace);
+
+       ASSERT(*ticp == NULL);
+       tic = xlog_ticket_alloc(log, unit_bytes, cnt, client, permanent,
+                               KM_SLEEP | KM_MAYFAIL);
+       if (!tic)
+               return XFS_ERROR(ENOMEM);
+
+       tic->t_trans_type = t_type;
+       *ticp = tic;
+
+       xlog_grant_push_ail(log, tic->t_unit_res * tic->t_cnt);
+
+       trace_xfs_log_reserve(log, tic);
+
+       error = xlog_grant_head_check(log, &log->l_reserve_head, tic,
+                                     &need_bytes);
+       if (error)
+               goto out_error;
+
+       xlog_grant_add_space(log, &log->l_reserve_head.grant, need_bytes);
+       xlog_grant_add_space(log, &log->l_write_head.grant, need_bytes);
+       trace_xfs_log_reserve_exit(log, tic);
+       xlog_verify_grant_tail(log);
+       return 0;
+
+out_error:
+       /*
+        * If we are failing, make sure the ticket doesn't have any current
+        * reservations.  We don't want to add this back when the ticket/
+        * transaction gets cancelled.
+        */
+       tic->t_curr_res = 0;
+       tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */
+       return error;
+}
+
+
 /*
  * NOTES:
  *
@@ -394,88 +549,6 @@ xfs_log_release_iclog(
        return 0;
 }
 
-/*
- *  1. Reserve an amount of on-disk log space and return a ticket corresponding
- *     to the reservation.
- *  2. Potentially, push buffers at tail of log to disk.
- *
- * Each reservation is going to reserve extra space for a log record header.
- * When writes happen to the on-disk log, we don't subtract the length of the
- * log record header from any reservation.  By wasting space in each
- * reservation, we prevent over allocation problems.
- */
-int
-xfs_log_reserve(
-       struct xfs_mount        *mp,
-       int                     unit_bytes,
-       int                     cnt,
-       struct xlog_ticket      **ticket,
-       __uint8_t               client,
-       uint                    flags,
-       uint                    t_type)
-{
-       struct log              *log = mp->m_log;
-       struct xlog_ticket      *internal_ticket;
-       int                     retval = 0;
-
-       ASSERT(client == XFS_TRANSACTION || client == XFS_LOG);
-
-       if (XLOG_FORCED_SHUTDOWN(log))
-               return XFS_ERROR(EIO);
-
-       XFS_STATS_INC(xs_try_logspace);
-
-
-       if (*ticket != NULL) {
-               ASSERT(flags & XFS_LOG_PERM_RESERV);
-               internal_ticket = *ticket;
-
-               /*
-                * this is a new transaction on the ticket, so we need to
-                * change the transaction ID so that the next transaction has a
-                * different TID in the log. Just add one to the existing tid
-                * so that we can see chains of rolling transactions in the log
-                * easily.
-                */
-               internal_ticket->t_tid++;
-
-               trace_xfs_log_reserve(log, internal_ticket);
-
-               xlog_grant_push_ail(log, internal_ticket->t_unit_res);
-               retval = xlog_regrant_write_log_space(log, internal_ticket);
-       } else {
-               /* may sleep if need to allocate more tickets */
-               internal_ticket = xlog_ticket_alloc(log, unit_bytes, cnt,
-                                                 client, flags,
-                                                 KM_SLEEP|KM_MAYFAIL);
-               if (!internal_ticket)
-                       return XFS_ERROR(ENOMEM);
-               internal_ticket->t_trans_type = t_type;
-               *ticket = internal_ticket;
-
-               trace_xfs_log_reserve(log, internal_ticket);
-
-               xlog_grant_push_ail(log,
-                                   (internal_ticket->t_unit_res *
-                                    internal_ticket->t_cnt));
-               retval = xlog_grant_log_space(log, internal_ticket);
-       }
-
-       if (unlikely(retval)) {
-               /*
-                * If we are failing, make sure the ticket doesn't have any
-                * current reservations.  We don't want to add this back
-                * when the ticket/ transaction gets cancelled.
-                */
-               internal_ticket->t_curr_res = 0;
-               /* ungrant will give back unit_res * t_cnt. */
-               internal_ticket->t_cnt = 0;
-       }
-
-       return retval;
-}
-
-
 /*
  * Mount a log filesystem
  *
@@ -760,64 +833,35 @@ xfs_log_item_init(
        INIT_LIST_HEAD(&item->li_cil);
 }
 
+/*
+ * Wake up processes waiting for log space after we have moved the log tail.
+ */
 void
-xfs_log_move_tail(xfs_mount_t  *mp,
-                 xfs_lsn_t     tail_lsn)
+xfs_log_space_wake(
+       struct xfs_mount        *mp)
 {
-       xlog_ticket_t   *tic;
-       xlog_t          *log = mp->m_log;
-       int             need_bytes, free_bytes;
+       struct log              *log = mp->m_log;
+       int                     free_bytes;
 
        if (XLOG_FORCED_SHUTDOWN(log))
                return;
 
-       if (tail_lsn == 0)
-               tail_lsn = atomic64_read(&log->l_last_sync_lsn);
-
-       /* tail_lsn == 1 implies that we weren't passed a valid value.  */
-       if (tail_lsn != 1)
-               atomic64_set(&log->l_tail_lsn, tail_lsn);
-
-       if (!list_empty_careful(&log->l_writeq)) {
-#ifdef DEBUG
-               if (log->l_flags & XLOG_ACTIVE_RECOVERY)
-                       panic("Recovery problem");
-#endif
-               spin_lock(&log->l_grant_write_lock);
-               free_bytes = xlog_space_left(log, &log->l_grant_write_head);
-               list_for_each_entry(tic, &log->l_writeq, t_queue) {
-                       ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV);
+       if (!list_empty_careful(&log->l_write_head.waiters)) {
+               ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY));
 
-                       if (free_bytes < tic->t_unit_res && tail_lsn != 1)
-                               break;
-                       tail_lsn = 0;
-                       free_bytes -= tic->t_unit_res;
-                       trace_xfs_log_regrant_write_wake_up(log, tic);
-                       wake_up(&tic->t_wait);
-               }
-               spin_unlock(&log->l_grant_write_lock);
+               spin_lock(&log->l_write_head.lock);
+               free_bytes = xlog_space_left(log, &log->l_write_head.grant);
+               xlog_grant_head_wake(log, &log->l_write_head, &free_bytes);
+               spin_unlock(&log->l_write_head.lock);
        }
 
-       if (!list_empty_careful(&log->l_reserveq)) {
-#ifdef DEBUG
-               if (log->l_flags & XLOG_ACTIVE_RECOVERY)
-                       panic("Recovery problem");
-#endif
-               spin_lock(&log->l_grant_reserve_lock);
-               free_bytes = xlog_space_left(log, &log->l_grant_reserve_head);
-               list_for_each_entry(tic, &log->l_reserveq, t_queue) {
-                       if (tic->t_flags & XLOG_TIC_PERM_RESERV)
-                               need_bytes = tic->t_unit_res*tic->t_cnt;
-                       else
-                               need_bytes = tic->t_unit_res;
-                       if (free_bytes < need_bytes && tail_lsn != 1)
-                               break;
-                       tail_lsn = 0;
-                       free_bytes -= need_bytes;
-                       trace_xfs_log_grant_wake_up(log, tic);
-                       wake_up(&tic->t_wait);
-               }
-               spin_unlock(&log->l_grant_reserve_lock);
+       if (!list_empty_careful(&log->l_reserve_head.waiters)) {
+               ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY));
+
+               spin_lock(&log->l_reserve_head.lock);
+               free_bytes = xlog_space_left(log, &log->l_reserve_head.grant);
+               xlog_grant_head_wake(log, &log->l_reserve_head, &free_bytes);
+               spin_unlock(&log->l_reserve_head.lock);
        }
 }
 
@@ -867,21 +911,7 @@ xfs_log_need_covered(xfs_mount_t *mp)
        return needed;
 }
 
-/******************************************************************************
- *
- *     local routines
- *
- ******************************************************************************
- */
-
-/* xfs_trans_tail_ail returns 0 when there is nothing in the list.
- * The log manager must keep track of the last LR which was committed
- * to disk.  The lsn of this LR will become the new tail_lsn whenever
- * xfs_trans_tail_ail returns 0.  If we don't do this, we run into
- * the situation where stuff could be written into the log but nothing
- * was ever in the AIL when asked.  Eventually, we panic since the
- * tail hits the head.
- *
+/*
  * We may be holding the log iclog lock upon entering this routine.
  */
 xfs_lsn_t
@@ -891,10 +921,17 @@ xlog_assign_tail_lsn(
        xfs_lsn_t               tail_lsn;
        struct log              *log = mp->m_log;
 
+       /*
+        * To make sure we always have a valid LSN for the log tail we keep
+        * track of the last LSN which was committed in log->l_last_sync_lsn,
+        * and use that when the AIL is empty and xfs_ail_min_lsn returns 0.
+        *
+        * If the AIL has been emptied we also need to wake any process
+        * waiting for this condition.
+        */
        tail_lsn = xfs_ail_min_lsn(mp->m_ail);
        if (!tail_lsn)
                tail_lsn = atomic64_read(&log->l_last_sync_lsn);
-
        atomic64_set(&log->l_tail_lsn, tail_lsn);
        return tail_lsn;
 }
@@ -1100,12 +1137,9 @@ xlog_alloc_log(xfs_mount_t       *mp,
        xlog_assign_atomic_lsn(&log->l_tail_lsn, 1, 0);
        xlog_assign_atomic_lsn(&log->l_last_sync_lsn, 1, 0);
        log->l_curr_cycle  = 1;     /* 0 is bad since this is initial value */
-       xlog_assign_grant_head(&log->l_grant_reserve_head, 1, 0);
-       xlog_assign_grant_head(&log->l_grant_write_head, 1, 0);
-       INIT_LIST_HEAD(&log->l_reserveq);
-       INIT_LIST_HEAD(&log->l_writeq);
-       spin_lock_init(&log->l_grant_reserve_lock);
-       spin_lock_init(&log->l_grant_write_lock);
+
+       xlog_grant_head_init(&log->l_reserve_head);
+       xlog_grant_head_init(&log->l_write_head);
 
        error = EFSCORRUPTED;
        if (xfs_sb_version_hassector(&mp->m_sb)) {
@@ -1280,7 +1314,7 @@ xlog_grant_push_ail(
 
        ASSERT(BTOBB(need_bytes) < log->l_logBBsize);
 
-       free_bytes = xlog_space_left(log, &log->l_grant_reserve_head);
+       free_bytes = xlog_space_left(log, &log->l_reserve_head.grant);
        free_blocks = BTOBBT(free_bytes);
 
        /*
@@ -1412,8 +1446,8 @@ xlog_sync(xlog_t          *log,
                 roundoff < BBTOB(1)));
 
        /* move grant heads by roundoff in sync */
-       xlog_grant_add_space(log, &log->l_grant_reserve_head, roundoff);
-       xlog_grant_add_space(log, &log->l_grant_write_head, roundoff);
+       xlog_grant_add_space(log, &log->l_reserve_head.grant, roundoff);
+       xlog_grant_add_space(log, &log->l_write_head.grant, roundoff);
 
        /* put cycle number in every block */
        xlog_pack_data(log, iclog, roundoff); 
@@ -2566,119 +2600,6 @@ restart:
        return 0;
 }      /* xlog_state_get_iclog_space */
 
-/*
- * Atomically get the log space required for a log ticket.
- *
- * Once a ticket gets put onto the reserveq, it will only return after the
- * needed reservation is satisfied.
- *
- * This function is structured so that it has a lock free fast path. This is
- * necessary because every new transaction reservation will come through this
- * path. Hence any lock will be globally hot if we take it unconditionally on
- * every pass.
- *
- * As tickets are only ever moved on and off the reserveq under the
- * l_grant_reserve_lock, we only need to take that lock if we are going to add
- * the ticket to the queue and sleep. We can avoid taking the lock if the ticket
- * was never added to the reserveq because the t_queue list head will be empty
- * and we hold the only reference to it so it can safely be checked unlocked.
- */
-STATIC int
-xlog_grant_log_space(
-       struct log              *log,
-       struct xlog_ticket      *tic)
-{
-       int                     free_bytes, need_bytes;
-       int                     error = 0;
-
-       ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY));
-
-       trace_xfs_log_grant_enter(log, tic);
-
-       /*
-        * If there are other waiters on the queue then give them a chance at
-        * logspace before us.  Wake up the first waiters, if we do not wake
-        * up all the waiters then go to sleep waiting for more free space,
-        * otherwise try to get some space for this transaction.
-        */
-       need_bytes = tic->t_unit_res;
-       if (tic->t_flags & XFS_LOG_PERM_RESERV)
-               need_bytes *= tic->t_ocnt;
-       free_bytes = xlog_space_left(log, &log->l_grant_reserve_head);
-       if (!list_empty_careful(&log->l_reserveq)) {
-               spin_lock(&log->l_grant_reserve_lock);
-               if (!xlog_reserveq_wake(log, &free_bytes) ||
-                   free_bytes < need_bytes)
-                       error = xlog_reserveq_wait(log, tic, need_bytes);
-               spin_unlock(&log->l_grant_reserve_lock);
-       } else if (free_bytes < need_bytes) {
-               spin_lock(&log->l_grant_reserve_lock);
-               error = xlog_reserveq_wait(log, tic, need_bytes);
-               spin_unlock(&log->l_grant_reserve_lock);
-       }
-       if (error)
-               return error;
-
-       xlog_grant_add_space(log, &log->l_grant_reserve_head, need_bytes);
-       xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes);
-       trace_xfs_log_grant_exit(log, tic);
-       xlog_verify_grant_tail(log);
-       return 0;
-}
-
-/*
- * Replenish the byte reservation required by moving the grant write head.
- *
- * Similar to xlog_grant_log_space, the function is structured to have a lock
- * free fast path.
- */
-STATIC int
-xlog_regrant_write_log_space(
-       struct log              *log,
-       struct xlog_ticket      *tic)
-{
-       int                     free_bytes, need_bytes;
-       int                     error = 0;
-
-       tic->t_curr_res = tic->t_unit_res;
-       xlog_tic_reset_res(tic);
-
-       if (tic->t_cnt > 0)
-               return 0;
-
-       ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY));
-
-       trace_xfs_log_regrant_write_enter(log, tic);
-
-       /*
-        * If there are other waiters on the queue then give them a chance at
-        * logspace before us.  Wake up the first waiters, if we do not wake
-        * up all the waiters then go to sleep waiting for more free space,
-        * otherwise try to get some space for this transaction.
-        */
-       need_bytes = tic->t_unit_res;
-       free_bytes = xlog_space_left(log, &log->l_grant_write_head);
-       if (!list_empty_careful(&log->l_writeq)) {
-               spin_lock(&log->l_grant_write_lock);
-               if (!xlog_writeq_wake(log, &free_bytes) ||
-                   free_bytes < need_bytes)
-                       error = xlog_writeq_wait(log, tic, need_bytes);
-               spin_unlock(&log->l_grant_write_lock);
-       } else if (free_bytes < need_bytes) {
-               spin_lock(&log->l_grant_write_lock);
-               error = xlog_writeq_wait(log, tic, need_bytes);
-               spin_unlock(&log->l_grant_write_lock);
-       }
-
-       if (error)
-               return error;
-
-       xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes);
-       trace_xfs_log_regrant_write_exit(log, tic);
-       xlog_verify_grant_tail(log);
-       return 0;
-}
-
 /* The first cnt-1 times through here we don't need to
  * move the grant write head because the permanent
  * reservation has reserved cnt times the unit amount.
@@ -2695,9 +2616,9 @@ xlog_regrant_reserve_log_space(xlog_t          *log,
        if (ticket->t_cnt > 0)
                ticket->t_cnt--;
 
-       xlog_grant_sub_space(log, &log->l_grant_reserve_head,
+       xlog_grant_sub_space(log, &log->l_reserve_head.grant,
                                        ticket->t_curr_res);
-       xlog_grant_sub_space(log, &log->l_grant_write_head,
+       xlog_grant_sub_space(log, &log->l_write_head.grant,
                                        ticket->t_curr_res);
        ticket->t_curr_res = ticket->t_unit_res;
        xlog_tic_reset_res(ticket);
@@ -2708,7 +2629,7 @@ xlog_regrant_reserve_log_space(xlog_t          *log,
        if (ticket->t_cnt > 0)
                return;
 
-       xlog_grant_add_space(log, &log->l_grant_reserve_head,
+       xlog_grant_add_space(log, &log->l_reserve_head.grant,
                                        ticket->t_unit_res);
 
        trace_xfs_log_regrant_reserve_exit(log, ticket);
@@ -2754,14 +2675,13 @@ xlog_ungrant_log_space(xlog_t        *log,
                bytes += ticket->t_unit_res*ticket->t_cnt;
        }
 
-       xlog_grant_sub_space(log, &log->l_grant_reserve_head, bytes);
-       xlog_grant_sub_space(log, &log->l_grant_write_head, bytes);
+       xlog_grant_sub_space(log, &log->l_reserve_head.grant, bytes);
+       xlog_grant_sub_space(log, &log->l_write_head.grant, bytes);
 
        trace_xfs_log_ungrant_exit(log, ticket);
 
-       xfs_log_move_tail(log->l_mp, 1);
-}      /* xlog_ungrant_log_space */
-
+       xfs_log_space_wake(log->l_mp);
+}
 
 /*
  * Flush iclog to disk if this is the last reference to the given iclog and
@@ -3219,7 +3139,7 @@ xlog_ticket_alloc(
        int             unit_bytes,
        int             cnt,
        char            client,
-       uint            xflags,
+       bool            permanent,
        int             alloc_flags)
 {
        struct xlog_ticket *tic;
@@ -3313,6 +3233,7 @@ xlog_ticket_alloc(
         }
 
        atomic_set(&tic->t_ref, 1);
+       tic->t_task             = current;
        INIT_LIST_HEAD(&tic->t_queue);
        tic->t_unit_res         = unit_bytes;
        tic->t_curr_res         = unit_bytes;
@@ -3322,9 +3243,8 @@ xlog_ticket_alloc(
        tic->t_clientid         = client;
        tic->t_flags            = XLOG_TIC_INITED;
        tic->t_trans_type       = 0;
-       if (xflags & XFS_LOG_PERM_RESERV)
+       if (permanent)
                tic->t_flags |= XLOG_TIC_PERM_RESERV;
-       init_waitqueue_head(&tic->t_wait);
 
        xlog_tic_reset_res(tic);
 
@@ -3380,7 +3300,7 @@ xlog_verify_grant_tail(
        int             tail_cycle, tail_blocks;
        int             cycle, space;
 
-       xlog_crack_grant_head(&log->l_grant_write_head, &cycle, &space);
+       xlog_crack_grant_head(&log->l_write_head.grant, &cycle, &space);
        xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_blocks);
        if (tail_cycle != cycle) {
                if (cycle - 1 != tail_cycle &&
@@ -3582,7 +3502,6 @@ xfs_log_force_umount(
        struct xfs_mount        *mp,
        int                     logerror)
 {
-       xlog_ticket_t   *tic;
        xlog_t          *log;
        int             retval;
 
@@ -3650,15 +3569,8 @@ xfs_log_force_umount(
         * we don't enqueue anything once the SHUTDOWN flag is set, and this
         * action is protected by the grant locks.
         */
-       spin_lock(&log->l_grant_reserve_lock);
-       list_for_each_entry(tic, &log->l_reserveq, t_queue)
-               wake_up(&tic->t_wait);
-       spin_unlock(&log->l_grant_reserve_lock);
-
-       spin_lock(&log->l_grant_write_lock);
-       list_for_each_entry(tic, &log->l_writeq, t_queue)
-               wake_up(&tic->t_wait);
-       spin_unlock(&log->l_grant_write_lock);
+       xlog_grant_head_wake_all(&log->l_reserve_head);
+       xlog_grant_head_wake_all(&log->l_write_head);
 
        if (!(log->l_iclog->ic_state & XLOG_STATE_IOERROR)) {
                ASSERT(!logerror);
index 2aee3b22d29c26d9d11b92f9e1531cab1c2540e3..2c622bedb3021c72f9c3dde56aeab1969549f350 100644 (file)
@@ -52,15 +52,6 @@ static inline xfs_lsn_t      _lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2)
  */
 #define XFS_LOG_REL_PERM_RESERV        0x1
 
-/*
- * Flags to xfs_log_reserve()
- *
- *     XFS_LOG_PERM_RESERV: Permanent reservation.  When writes are
- *             performed against this type of reservation, the reservation
- *             is not decreased.  Long running transactions should use this.
- */
-#define XFS_LOG_PERM_RESERV    0x2
-
 /*
  * Flags to xfs_log_force()
  *
@@ -160,8 +151,8 @@ int   xfs_log_mount(struct xfs_mount        *mp,
                        xfs_daddr_t             start_block,
                        int                     num_bblocks);
 int      xfs_log_mount_finish(struct xfs_mount *mp);
-void     xfs_log_move_tail(struct xfs_mount    *mp,
-                           xfs_lsn_t           tail_lsn);
+xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp);
+void     xfs_log_space_wake(struct xfs_mount *mp);
 int      xfs_log_notify(struct xfs_mount       *mp,
                         struct xlog_in_core    *iclog,
                         xfs_log_callback_t     *callback_entry);
@@ -172,8 +163,9 @@ int   xfs_log_reserve(struct xfs_mount *mp,
                          int              count,
                          struct xlog_ticket **ticket,
                          __uint8_t        clientid,
-                         uint             flags,
+                         bool             permanent,
                          uint             t_type);
+int      xfs_log_regrant(struct xfs_mount *mp, struct xlog_ticket *tic);
 int      xfs_log_unmount_write(struct xfs_mount *mp);
 void      xfs_log_unmount(struct xfs_mount *mp);
 int      xfs_log_force_umount(struct xfs_mount *mp, int logerror);
index 2d3b6a498d632752a95109b2d699fe7de36afec7..2152900b79d4059d943ca6cdc94d4d4e502b62cc 100644 (file)
@@ -239,8 +239,8 @@ typedef struct xlog_res {
 } xlog_res_t;
 
 typedef struct xlog_ticket {
-       wait_queue_head_t  t_wait;       /* ticket wait queue */
        struct list_head   t_queue;      /* reserve/write queue */
+       struct task_struct *t_task;      /* task that owns this ticket */
        xlog_tid_t         t_tid;        /* transaction identifier       : 4  */
        atomic_t           t_ref;        /* ticket reference count       : 4  */
        int                t_curr_res;   /* current reservation in bytes : 4  */
@@ -469,6 +469,16 @@ struct xfs_cil {
 #define XLOG_CIL_SPACE_LIMIT(log)      (log->l_logsize >> 3)
 #define XLOG_CIL_HARD_SPACE_LIMIT(log) (3 * (log->l_logsize >> 4))
 
+/*
+ * Ticket grant lock, wait queue and accounting.  Each head has its own
+ * cacheline as these are quite hot and can be operated on concurrently.
+ */
+struct xlog_grant_head {
+       spinlock_t              lock ____cacheline_aligned_in_smp;
+       struct list_head        waiters;        /* tickets waiting on this head */
+       atomic64_t              grant;          /* grant head: cycle + bytes */
+};
+
 /*
  * The reservation head lsn is not made up of a cycle number and block number.
  * Instead, it uses a cycle number and byte number.  Logs don't expect to
@@ -520,17 +530,8 @@ typedef struct log {
        /* lsn of 1st LR with unflushed * buffers */
        atomic64_t              l_tail_lsn ____cacheline_aligned_in_smp;
 
-       /*
-        * ticket grant locks, queues and accounting have their own cachlines
-        * as these are quite hot and can be operated on concurrently.
-        */
-       spinlock_t              l_grant_reserve_lock ____cacheline_aligned_in_smp;
-       struct list_head        l_reserveq;
-       atomic64_t              l_grant_reserve_head;
-
-       spinlock_t              l_grant_write_lock ____cacheline_aligned_in_smp;
-       struct list_head        l_writeq;
-       atomic64_t              l_grant_write_head;
+       struct xlog_grant_head  l_reserve_head;
+       struct xlog_grant_head  l_write_head;
 
        /* The following field are used for debugging; need to hold icloglock */
 #ifdef DEBUG
@@ -545,14 +546,13 @@ typedef struct log {
 #define XLOG_FORCED_SHUTDOWN(log)      ((log)->l_flags & XLOG_IO_ERROR)
 
 /* common routines */
-extern xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp);
 extern int      xlog_recover(xlog_t *log);
 extern int      xlog_recover_finish(xlog_t *log);
 extern void     xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int);
 
 extern kmem_zone_t *xfs_log_ticket_zone;
 struct xlog_ticket *xlog_ticket_alloc(struct log *log, int unit_bytes,
-                               int count, char client, uint xflags,
+                               int count, char client, bool permanent,
                                int alloc_flags);
 
 
index 0ed9ee77937c50470fdea8b7573ea738e4a1587c..7c75c7374d5a4acfdc1e83e139db95294d35fd58 100644 (file)
@@ -965,9 +965,9 @@ xlog_find_tail(
                log->l_curr_cycle++;
        atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn));
        atomic64_set(&log->l_last_sync_lsn, be64_to_cpu(rhead->h_lsn));
-       xlog_assign_grant_head(&log->l_grant_reserve_head, log->l_curr_cycle,
+       xlog_assign_grant_head(&log->l_reserve_head.grant, log->l_curr_cycle,
                                        BBTOB(log->l_curr_block));
-       xlog_assign_grant_head(&log->l_grant_write_head, log->l_curr_cycle,
+       xlog_assign_grant_head(&log->l_write_head.grant, log->l_curr_cycle,
                                        BBTOB(log->l_curr_block));
 
        /*
@@ -3695,7 +3695,7 @@ xlog_do_recover(
 
        /* Convert superblock from on-disk format */
        sbp = &log->l_mp->m_sb;
-       xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp));
+       xfs_sb_from_disk(log->l_mp, XFS_BUF_TO_SBP(bp));
        ASSERT(sbp->sb_magicnum == XFS_SB_MAGIC);
        ASSERT(xfs_sb_good_version(sbp));
        xfs_buf_relse(bp);
index d06afbc3540dde90d7796139bf658c8439a83d4c..1ffead4b2296c947bfbfd3c32a93a10946830028 100644 (file)
@@ -158,7 +158,7 @@ xfs_uuid_mount(
 
  out_duplicate:
        mutex_unlock(&xfs_uuid_table_mutex);
-       xfs_warn(mp, "Filesystem has duplicate UUID - can't mount");
+       xfs_warn(mp, "Filesystem has duplicate UUID %pU - can't mount", uuid);
        return XFS_ERROR(EINVAL);
 }
 
@@ -553,9 +553,11 @@ out_unwind:
 
 void
 xfs_sb_from_disk(
-       xfs_sb_t        *to,
+       struct xfs_mount        *mp,
        xfs_dsb_t       *from)
 {
+       struct xfs_sb *to = &mp->m_sb;
+
        to->sb_magicnum = be32_to_cpu(from->sb_magicnum);
        to->sb_blocksize = be32_to_cpu(from->sb_blocksize);
        to->sb_dblocks = be64_to_cpu(from->sb_dblocks);
@@ -693,7 +695,7 @@ reread:
         * Initialize the mount structure from the superblock.
         * But first do some basic consistency checking.
         */
-       xfs_sb_from_disk(&mp->m_sb, XFS_BUF_TO_SBP(bp));
+       xfs_sb_from_disk(mp, XFS_BUF_TO_SBP(bp));
        error = xfs_mount_validate_sb(mp, &(mp->m_sb), flags);
        if (error) {
                if (loud)
index 19f69e232509019ffa1ac99381e75cb0279ee908..9eba73887829a89027c0269b1b3cc468343e19e8 100644 (file)
@@ -211,6 +211,9 @@ typedef struct xfs_mount {
        struct shrinker         m_inode_shrink; /* inode reclaim shrinker */
        int64_t                 m_low_space[XFS_LOWSP_MAX];
                                                /* low free space thresholds */
+
+       struct workqueue_struct *m_data_workqueue;
+       struct workqueue_struct *m_unwritten_workqueue;
 } xfs_mount_t;
 
 /*
@@ -395,7 +398,7 @@ extern void xfs_set_low_space_thresholds(struct xfs_mount *);
 extern void    xfs_mod_sb(struct xfs_trans *, __int64_t);
 extern int     xfs_initialize_perag(struct xfs_mount *, xfs_agnumber_t,
                                        xfs_agnumber_t *);
-extern void    xfs_sb_from_disk(struct xfs_sb *, struct xfs_dsb *);
+extern void    xfs_sb_from_disk(struct xfs_mount *, struct xfs_dsb *);
 extern void    xfs_sb_to_disk(struct xfs_dsb *, struct xfs_sb *, __int64_t);
 
 #endif /* __XFS_MOUNT_H__ */
index c436def733bf892eb324603f688e6d92b92ffc61..55c6afedc8796a8223cf67f1e2c3073cc555a050 100644 (file)
  * quota functionality, including maintaining the freelist and hash
  * tables of dquots.
  */
-struct mutex   xfs_Gqm_lock;
-struct xfs_qm  *xfs_Gqm;
-
-kmem_zone_t    *qm_dqzone;
-kmem_zone_t    *qm_dqtrxzone;
-
-STATIC void    xfs_qm_list_init(xfs_dqlist_t *, char *, int);
-STATIC void    xfs_qm_list_destroy(xfs_dqlist_t *);
-
 STATIC int     xfs_qm_init_quotainos(xfs_mount_t *);
 STATIC int     xfs_qm_init_quotainfo(xfs_mount_t *);
 STATIC int     xfs_qm_shake(struct shrinker *, struct shrink_control *);
 
-static struct shrinker xfs_qm_shaker = {
-       .shrink = xfs_qm_shake,
-       .seeks = DEFAULT_SEEKS,
-};
-
 /*
- * Initialize the XQM structure.
- * Note that there is not one quota manager per file system.
+ * We use the batch lookup interface to iterate over the dquots as it is
+ * currently the only interface into the radix tree code that allows
+ * fuzzy lookups instead of exact matches.  Holding the lock over multiple
+ * operations is fine as all callers run either during mount/umount
+ * or quotaoff.
  */
-STATIC struct xfs_qm *
-xfs_Gqm_init(void)
+#define XFS_DQ_LOOKUP_BATCH    32
+
+STATIC int
+xfs_qm_dquot_walk(
+       struct xfs_mount        *mp,
+       int                     type,
+       int                     (*execute)(struct xfs_dquot *dqp))
 {
-       xfs_dqhash_t    *udqhash, *gdqhash;
-       xfs_qm_t        *xqm;
-       size_t          hsize;
-       uint            i;
+       struct xfs_quotainfo    *qi = mp->m_quotainfo;
+       struct radix_tree_root  *tree = XFS_DQUOT_TREE(qi, type);
+       uint32_t                next_index;
+       int                     last_error = 0;
+       int                     skipped;
+       int                     nr_found;
+
+restart:
+       skipped = 0;
+       next_index = 0;
+       nr_found = 0;
+
+       while (1) {
+               struct xfs_dquot *batch[XFS_DQ_LOOKUP_BATCH];
+               int             error = 0;
+               int             i;
+
+               mutex_lock(&qi->qi_tree_lock);
+               nr_found = radix_tree_gang_lookup(tree, (void **)batch,
+                                       next_index, XFS_DQ_LOOKUP_BATCH);
+               if (!nr_found) {
+                       mutex_unlock(&qi->qi_tree_lock);
+                       break;
+               }
 
-       /*
-        * Initialize the dquot hash tables.
-        */
-       udqhash = kmem_zalloc_greedy(&hsize,
-                                    XFS_QM_HASHSIZE_LOW * sizeof(xfs_dqhash_t),
-                                    XFS_QM_HASHSIZE_HIGH * sizeof(xfs_dqhash_t));
-       if (!udqhash)
-               goto out;
+               for (i = 0; i < nr_found; i++) {
+                       struct xfs_dquot *dqp = batch[i];
 
-       gdqhash = kmem_zalloc_large(hsize);
-       if (!gdqhash)
-               goto out_free_udqhash;
+                       next_index = be32_to_cpu(dqp->q_core.d_id) + 1;
 
-       hsize /= sizeof(xfs_dqhash_t);
+                       error = execute(batch[i]);
+                       if (error == EAGAIN) {
+                               skipped++;
+                               continue;
+                       }
+                       if (error && last_error != EFSCORRUPTED)
+                               last_error = error;
+               }
 
-       xqm = kmem_zalloc(sizeof(xfs_qm_t), KM_SLEEP);
-       xqm->qm_dqhashmask = hsize - 1;
-       xqm->qm_usr_dqhtable = udqhash;
-       xqm->qm_grp_dqhtable = gdqhash;
-       ASSERT(xqm->qm_usr_dqhtable != NULL);
-       ASSERT(xqm->qm_grp_dqhtable != NULL);
+               mutex_unlock(&qi->qi_tree_lock);
 
-       for (i = 0; i < hsize; i++) {
-               xfs_qm_list_init(&(xqm->qm_usr_dqhtable[i]), "uxdqh", i);
-               xfs_qm_list_init(&(xqm->qm_grp_dqhtable[i]), "gxdqh", i);
+               /* bail out if the filesystem is corrupted.  */
+               if (last_error == EFSCORRUPTED) {
+                       skipped = 0;
+                       break;
+               }
        }
 
-       /*
-        * Freelist of all dquots of all file systems
-        */
-       INIT_LIST_HEAD(&xqm->qm_dqfrlist);
-       xqm->qm_dqfrlist_cnt = 0;
-       mutex_init(&xqm->qm_dqfrlist_lock);
-
-       /*
-        * dquot zone. we register our own low-memory callback.
-        */
-       if (!qm_dqzone) {
-               xqm->qm_dqzone = kmem_zone_init(sizeof(xfs_dquot_t),
-                                               "xfs_dquots");
-               qm_dqzone = xqm->qm_dqzone;
-       } else
-               xqm->qm_dqzone = qm_dqzone;
-
-       register_shrinker(&xfs_qm_shaker);
-
-       /*
-        * The t_dqinfo portion of transactions.
-        */
-       if (!qm_dqtrxzone) {
-               xqm->qm_dqtrxzone = kmem_zone_init(sizeof(xfs_dquot_acct_t),
-                                                  "xfs_dqtrx");
-               qm_dqtrxzone = xqm->qm_dqtrxzone;
-       } else
-               xqm->qm_dqtrxzone = qm_dqtrxzone;
-
-       atomic_set(&xqm->qm_totaldquots, 0);
-       xqm->qm_nrefs = 0;
-       return xqm;
+       if (skipped) {
+               delay(1);
+               goto restart;
+       }
 
- out_free_udqhash:
-       kmem_free_large(udqhash);
- out:
-       return NULL;
+       return last_error;
 }
 
+
 /*
- * Destroy the global quota manager when its reference count goes to zero.
+ * Purge a dquot from all tracking data structures and free it.
  */
-STATIC void
-xfs_qm_destroy(
-       struct xfs_qm   *xqm)
+STATIC int
+xfs_qm_dqpurge(
+       struct xfs_dquot        *dqp)
 {
-       int             hsize, i;
+       struct xfs_mount        *mp = dqp->q_mount;
+       struct xfs_quotainfo    *qi = mp->m_quotainfo;
+       struct xfs_dquot        *gdqp = NULL;
 
-       ASSERT(xqm != NULL);
-       ASSERT(xqm->qm_nrefs == 0);
+       xfs_dqlock(dqp);
+       if ((dqp->dq_flags & XFS_DQ_FREEING) || dqp->q_nrefs != 0) {
+               xfs_dqunlock(dqp);
+               return EAGAIN;
+       }
 
-       unregister_shrinker(&xfs_qm_shaker);
+       /*
+        * If this quota has a group hint attached, prepare for releasing it
+        * now.
+        */
+       gdqp = dqp->q_gdquot;
+       if (gdqp) {
+               xfs_dqlock(gdqp);
+               dqp->q_gdquot = NULL;
+       }
 
-       mutex_lock(&xqm->qm_dqfrlist_lock);
-       ASSERT(list_empty(&xqm->qm_dqfrlist));
-       mutex_unlock(&xqm->qm_dqfrlist_lock);
+       dqp->dq_flags |= XFS_DQ_FREEING;
 
-       hsize = xqm->qm_dqhashmask + 1;
-       for (i = 0; i < hsize; i++) {
-               xfs_qm_list_destroy(&(xqm->qm_usr_dqhtable[i]));
-               xfs_qm_list_destroy(&(xqm->qm_grp_dqhtable[i]));
+       /*
+        * If we're turning off quotas, we have to make sure that, for
+        * example, we don't delete quota disk blocks while dquots are
+        * in the process of getting written to those disk blocks.
+        * This dquot might well be on AIL, and we can't leave it there
+        * if we're turning off quotas. Basically, we need this flush
+        * lock, and are willing to block on it.
+        */
+       if (!xfs_dqflock_nowait(dqp)) {
+               /*
+                * Block on the flush lock after nudging dquot buffer,
+                * if it is incore.
+                */
+               xfs_dqflock_pushbuf_wait(dqp);
        }
-       kmem_free_large(xqm->qm_usr_dqhtable);
-       kmem_free_large(xqm->qm_grp_dqhtable);
-       xqm->qm_usr_dqhtable = NULL;
-       xqm->qm_grp_dqhtable = NULL;
-       xqm->qm_dqhashmask = 0;
 
-       kmem_free(xqm);
-}
-
-/*
- * Called at mount time to let XQM know that another file system is
- * starting quotas. This isn't crucial information as the individual mount
- * structures are pretty independent, but it helps the XQM keep a
- * global view of what's going on.
- */
-/* ARGSUSED */
-STATIC int
-xfs_qm_hold_quotafs_ref(
-       struct xfs_mount *mp)
-{
        /*
-        * Need to lock the xfs_Gqm structure for things like this. For example,
-        * the structure could disappear between the entry to this routine and
-        * a HOLD operation if not locked.
+        * If we are turning this type of quotas off, we don't care
+        * about the dirty metadata sitting in this dquot. OTOH, if
+        * we're unmounting, we do care, so we flush it and wait.
         */
-       mutex_lock(&xfs_Gqm_lock);
+       if (XFS_DQ_IS_DIRTY(dqp)) {
+               int     error;
 
-       if (!xfs_Gqm) {
-               xfs_Gqm = xfs_Gqm_init();
-               if (!xfs_Gqm) {
-                       mutex_unlock(&xfs_Gqm_lock);
-                       return ENOMEM;
-               }
+               /*
+                * We don't care about getting disk errors here. We need
+                * to purge this dquot anyway, so we go ahead regardless.
+                */
+               error = xfs_qm_dqflush(dqp, SYNC_WAIT);
+               if (error)
+                       xfs_warn(mp, "%s: dquot %p flush failed",
+                               __func__, dqp);
+               xfs_dqflock(dqp);
        }
 
+       ASSERT(atomic_read(&dqp->q_pincount) == 0);
+       ASSERT(XFS_FORCED_SHUTDOWN(mp) ||
+              !(dqp->q_logitem.qli_item.li_flags & XFS_LI_IN_AIL));
+
+       xfs_dqfunlock(dqp);
+       xfs_dqunlock(dqp);
+
+       radix_tree_delete(XFS_DQUOT_TREE(qi, dqp->q_core.d_flags),
+                         be32_to_cpu(dqp->q_core.d_id));
+       qi->qi_dquots--;
+
        /*
-        * We can keep a list of all filesystems with quotas mounted for
-        * debugging and statistical purposes, but ...
-        * Just take a reference and get out.
+        * We move dquots to the LRU list as soon as their reference count
+        * hits zero, so it really should be on the LRU list here.
         */
-       xfs_Gqm->qm_nrefs++;
-       mutex_unlock(&xfs_Gqm_lock);
+       mutex_lock(&qi->qi_lru_lock);
+       ASSERT(!list_empty(&dqp->q_lru));
+       list_del_init(&dqp->q_lru);
+       qi->qi_lru_count--;
+       XFS_STATS_DEC(xs_qm_dquot_unused);
+       mutex_unlock(&qi->qi_lru_lock);
 
+       xfs_qm_dqdestroy(dqp);
+
+       if (gdqp)
+               xfs_qm_dqput(gdqp);
        return 0;
 }
 
-
 /*
- * Release the reference that a filesystem took at mount time,
- * so that we know when we need to destroy the entire quota manager.
+ * Purge the dquot cache.
  */
-/* ARGSUSED */
-STATIC void
-xfs_qm_rele_quotafs_ref(
-       struct xfs_mount *mp)
+void
+xfs_qm_dqpurge_all(
+       struct xfs_mount        *mp,
+       uint                    flags)
 {
-       ASSERT(xfs_Gqm);
-       ASSERT(xfs_Gqm->qm_nrefs > 0);
-
-       /*
-        * Destroy the entire XQM. If somebody mounts with quotaon, this'll
-        * be restarted.
-        */
-       mutex_lock(&xfs_Gqm_lock);
-       if (--xfs_Gqm->qm_nrefs == 0) {
-               xfs_qm_destroy(xfs_Gqm);
-               xfs_Gqm = NULL;
-       }
-       mutex_unlock(&xfs_Gqm_lock);
+       if (flags & XFS_QMOPT_UQUOTA)
+               xfs_qm_dquot_walk(mp, XFS_DQ_USER, xfs_qm_dqpurge);
+       if (flags & XFS_QMOPT_GQUOTA)
+               xfs_qm_dquot_walk(mp, XFS_DQ_GROUP, xfs_qm_dqpurge);
+       if (flags & XFS_QMOPT_PQUOTA)
+               xfs_qm_dquot_walk(mp, XFS_DQ_PROJ, xfs_qm_dqpurge);
 }
 
 /*
@@ -376,175 +371,6 @@ xfs_qm_unmount_quotas(
        }
 }
 
-/*
- * Flush all dquots of the given file system to disk. The dquots are
- * _not_ purged from memory here, just their data written to disk.
- */
-STATIC int
-xfs_qm_dqflush_all(
-       struct xfs_mount        *mp)
-{
-       struct xfs_quotainfo    *q = mp->m_quotainfo;
-       int                     recl;
-       struct xfs_dquot        *dqp;
-       int                     error;
-
-       if (!q)
-               return 0;
-again:
-       mutex_lock(&q->qi_dqlist_lock);
-       list_for_each_entry(dqp, &q->qi_dqlist, q_mplist) {
-               xfs_dqlock(dqp);
-               if ((dqp->dq_flags & XFS_DQ_FREEING) ||
-                   !XFS_DQ_IS_DIRTY(dqp)) {
-                       xfs_dqunlock(dqp);
-                       continue;
-               }
-
-               /* XXX a sentinel would be better */
-               recl = q->qi_dqreclaims;
-               if (!xfs_dqflock_nowait(dqp)) {
-                       /*
-                        * If we can't grab the flush lock then check
-                        * to see if the dquot has been flushed delayed
-                        * write.  If so, grab its buffer and send it
-                        * out immediately.  We'll be able to acquire
-                        * the flush lock when the I/O completes.
-                        */
-                       xfs_dqflock_pushbuf_wait(dqp);
-               }
-               /*
-                * Let go of the mplist lock. We don't want to hold it
-                * across a disk write.
-                */
-               mutex_unlock(&q->qi_dqlist_lock);
-               error = xfs_qm_dqflush(dqp, 0);
-               xfs_dqunlock(dqp);
-               if (error)
-                       return error;
-
-               mutex_lock(&q->qi_dqlist_lock);
-               if (recl != q->qi_dqreclaims) {
-                       mutex_unlock(&q->qi_dqlist_lock);
-                       /* XXX restart limit */
-                       goto again;
-               }
-       }
-
-       mutex_unlock(&q->qi_dqlist_lock);
-       /* return ! busy */
-       return 0;
-}
-
-/*
- * Release the group dquot pointers the user dquots may be
- * carrying around as a hint. mplist is locked on entry and exit.
- */
-STATIC void
-xfs_qm_detach_gdquots(
-       struct xfs_mount        *mp)
-{
-       struct xfs_quotainfo    *q = mp->m_quotainfo;
-       struct xfs_dquot        *dqp, *gdqp;
-
- again:
-       ASSERT(mutex_is_locked(&q->qi_dqlist_lock));
-       list_for_each_entry(dqp, &q->qi_dqlist, q_mplist) {
-               xfs_dqlock(dqp);
-               if (dqp->dq_flags & XFS_DQ_FREEING) {
-                       xfs_dqunlock(dqp);
-                       mutex_unlock(&q->qi_dqlist_lock);
-                       delay(1);
-                       mutex_lock(&q->qi_dqlist_lock);
-                       goto again;
-               }
-
-               gdqp = dqp->q_gdquot;
-               if (gdqp)
-                       dqp->q_gdquot = NULL;
-               xfs_dqunlock(dqp);
-
-               if (gdqp)
-                       xfs_qm_dqrele(gdqp);
-       }
-}
-
-/*
- * Go through all the incore dquots of this file system and take them
- * off the mplist and hashlist, if the dquot type matches the dqtype
- * parameter. This is used when turning off quota accounting for
- * users and/or groups, as well as when the filesystem is unmounting.
- */
-STATIC int
-xfs_qm_dqpurge_int(
-       struct xfs_mount        *mp,
-       uint                    flags)
-{
-       struct xfs_quotainfo    *q = mp->m_quotainfo;
-       struct xfs_dquot        *dqp, *n;
-       uint                    dqtype;
-       int                     nmisses = 0;
-       LIST_HEAD               (dispose_list);
-
-       if (!q)
-               return 0;
-
-       dqtype = (flags & XFS_QMOPT_UQUOTA) ? XFS_DQ_USER : 0;
-       dqtype |= (flags & XFS_QMOPT_PQUOTA) ? XFS_DQ_PROJ : 0;
-       dqtype |= (flags & XFS_QMOPT_GQUOTA) ? XFS_DQ_GROUP : 0;
-
-       mutex_lock(&q->qi_dqlist_lock);
-
-       /*
-        * In the first pass through all incore dquots of this filesystem,
-        * we release the group dquot pointers the user dquots may be
-        * carrying around as a hint. We need to do this irrespective of
-        * what's being turned off.
-        */
-       xfs_qm_detach_gdquots(mp);
-
-       /*
-        * Try to get rid of all of the unwanted dquots.
-        */
-       list_for_each_entry_safe(dqp, n, &q->qi_dqlist, q_mplist) {
-               xfs_dqlock(dqp);
-               if ((dqp->dq_flags & dqtype) != 0 &&
-                   !(dqp->dq_flags & XFS_DQ_FREEING)) {
-                       if (dqp->q_nrefs == 0) {
-                               dqp->dq_flags |= XFS_DQ_FREEING;
-                               list_move_tail(&dqp->q_mplist, &dispose_list);
-                       } else
-                               nmisses++;
-               }
-               xfs_dqunlock(dqp);
-       }
-       mutex_unlock(&q->qi_dqlist_lock);
-
-       list_for_each_entry_safe(dqp, n, &dispose_list, q_mplist)
-               xfs_qm_dqpurge(dqp);
-
-       return nmisses;
-}
-
-int
-xfs_qm_dqpurge_all(
-       xfs_mount_t     *mp,
-       uint            flags)
-{
-       int             ndquots;
-
-       /*
-        * Purge the dquot cache.
-        * None of the dquots should really be busy at this point.
-        */
-       if (mp->m_quotainfo) {
-               while ((ndquots = xfs_qm_dqpurge_int(mp, flags))) {
-                       delay(ndquots * 10);
-               }
-       }
-       return 0;
-}
-
 STATIC int
 xfs_qm_dqattach_one(
        xfs_inode_t     *ip,
@@ -782,14 +608,6 @@ xfs_qm_dqdetach(
        }
 }
 
-/*
- * The hash chains and the mplist use the same xfs_dqhash structure as
- * their list head, but we can take the mplist qh_lock and one of the
- * hash qh_locks at the same time without any problem as they aren't
- * related.
- */
-static struct lock_class_key xfs_quota_mplist_class;
-
 /*
  * This initializes all the quota information that's kept in the
  * mount structure
@@ -804,13 +622,6 @@ xfs_qm_init_quotainfo(
 
        ASSERT(XFS_IS_QUOTA_RUNNING(mp));
 
-       /*
-        * Tell XQM that we exist as soon as possible.
-        */
-       if ((error = xfs_qm_hold_quotafs_ref(mp))) {
-               return error;
-       }
-
        qinf = mp->m_quotainfo = kmem_zalloc(sizeof(xfs_quotainfo_t), KM_SLEEP);
 
        /*
@@ -823,11 +634,13 @@ xfs_qm_init_quotainfo(
                return error;
        }
 
-       INIT_LIST_HEAD(&qinf->qi_dqlist);
-       mutex_init(&qinf->qi_dqlist_lock);
-       lockdep_set_class(&qinf->qi_dqlist_lock, &xfs_quota_mplist_class);
+       INIT_RADIX_TREE(&qinf->qi_uquota_tree, GFP_NOFS);
+       INIT_RADIX_TREE(&qinf->qi_gquota_tree, GFP_NOFS);
+       mutex_init(&qinf->qi_tree_lock);
 
-       qinf->qi_dqreclaims = 0;
+       INIT_LIST_HEAD(&qinf->qi_lru_list);
+       qinf->qi_lru_count = 0;
+       mutex_init(&qinf->qi_lru_lock);
 
        /* mutex used to serialize quotaoffs */
        mutex_init(&qinf->qi_quotaofflock);
@@ -894,6 +707,9 @@ xfs_qm_init_quotainfo(
                qinf->qi_rtbwarnlimit = XFS_QM_RTBWARNLIMIT;
        }
 
+       qinf->qi_shrinker.shrink = xfs_qm_shake;
+       qinf->qi_shrinker.seeks = DEFAULT_SEEKS;
+       register_shrinker(&qinf->qi_shrinker);
        return 0;
 }
 
@@ -911,17 +727,8 @@ xfs_qm_destroy_quotainfo(
 
        qi = mp->m_quotainfo;
        ASSERT(qi != NULL);
-       ASSERT(xfs_Gqm != NULL);
-
-       /*
-        * Release the reference that XQM kept, so that we know
-        * when the XQM structure should be freed. We cannot assume
-        * that xfs_Gqm is non-null after this point.
-        */
-       xfs_qm_rele_quotafs_ref(mp);
 
-       ASSERT(list_empty(&qi->qi_dqlist));
-       mutex_destroy(&qi->qi_dqlist_lock);
+       unregister_shrinker(&qi->qi_shrinker);
 
        if (qi->qi_uquotaip) {
                IRELE(qi->qi_uquotaip);
@@ -936,30 +743,6 @@ xfs_qm_destroy_quotainfo(
        mp->m_quotainfo = NULL;
 }
 
-
-
-/* ------------------- PRIVATE STATIC FUNCTIONS ----------------------- */
-
-/* ARGSUSED */
-STATIC void
-xfs_qm_list_init(
-       xfs_dqlist_t    *list,
-       char            *str,
-       int             n)
-{
-       mutex_init(&list->qh_lock);
-       INIT_LIST_HEAD(&list->qh_list);
-       list->qh_version = 0;
-       list->qh_nelems = 0;
-}
-
-STATIC void
-xfs_qm_list_destroy(
-       xfs_dqlist_t    *list)
-{
-       mutex_destroy(&(list->qh_lock));
-}
-
 /*
  * Create an inode and return with a reference already taken, but unlocked
  * This is how we create quota inodes
@@ -1397,6 +1180,28 @@ error0:
        return error;
 }
 
+STATIC int
+xfs_qm_flush_one(
+       struct xfs_dquot        *dqp)
+{
+       int                     error = 0;
+
+       xfs_dqlock(dqp);
+       if (dqp->dq_flags & XFS_DQ_FREEING)     /* already being torn down */
+               goto out_unlock;
+       if (!XFS_DQ_IS_DIRTY(dqp))              /* clean: nothing to write */
+               goto out_unlock;
+       /* Flush lock is busy: nudge the dquot buffer and wait for the lock. */
+       if (!xfs_dqflock_nowait(dqp))
+               xfs_dqflock_pushbuf_wait(dqp);
+       /* Push the dquot to its disk buffer; errors go back to the walker. */
+       error = xfs_qm_dqflush(dqp, 0);
+
+out_unlock:
+       xfs_dqunlock(dqp);
+       return error;
+}
+
 /*
  * Walk thru all the filesystem inodes and construct a consistent view
  * of the disk quota world. If the quotacheck fails, disable quotas.
@@ -1405,7 +1210,7 @@ int
 xfs_qm_quotacheck(
        xfs_mount_t     *mp)
 {
-       int             done, count, error;
+       int             done, count, error, error2;
        xfs_ino_t       lastino;
        size_t          structsz;
        xfs_inode_t     *uip, *gip;
@@ -1419,12 +1224,6 @@ xfs_qm_quotacheck(
        ASSERT(mp->m_quotainfo->qi_uquotaip || mp->m_quotainfo->qi_gquotaip);
        ASSERT(XFS_IS_QUOTA_RUNNING(mp));
 
-       /*
-        * There should be no cached dquots. The (simplistic) quotacheck
-        * algorithm doesn't like that.
-        */
-       ASSERT(list_empty(&mp->m_quotainfo->qi_dqlist));
-
        xfs_notice(mp, "Quotacheck needed: Please wait.");
 
        /*
@@ -1463,12 +1262,21 @@ xfs_qm_quotacheck(
        } while (!done);
 
        /*
-        * We've made all the changes that we need to make incore.
-        * Flush them down to disk buffers if everything was updated
-        * successfully.
+        * We've made all the changes that we need to make incore.  Flush them
+        * down to disk buffers if everything was updated successfully.
         */
-       if (!error)
-               error = xfs_qm_dqflush_all(mp);
+       if (XFS_IS_UQUOTA_ON(mp))
+               error = xfs_qm_dquot_walk(mp, XFS_DQ_USER, xfs_qm_flush_one);
+       if (XFS_IS_GQUOTA_ON(mp)) {
+               error2 = xfs_qm_dquot_walk(mp, XFS_DQ_GROUP, xfs_qm_flush_one);
+               if (!error)
+                       error = error2;
+       }
+       if (XFS_IS_PQUOTA_ON(mp)) {
+               error2 = xfs_qm_dquot_walk(mp, XFS_DQ_PROJ, xfs_qm_flush_one);
+               if (!error)
+                       error = error2;
+       }
 
        /*
         * We can get this error if we couldn't do a dquot allocation inside
@@ -1496,7 +1304,7 @@ xfs_qm_quotacheck(
         * quotachecked status, since we won't be doing accounting for
         * that type anymore.
         */
-       mp->m_qflags &= ~(XFS_OQUOTA_CHKD | XFS_UQUOTA_CHKD);
+       mp->m_qflags &= ~XFS_ALL_QUOTA_CHKD;
        mp->m_qflags |= flags;
 
  error_return:
@@ -1508,7 +1316,6 @@ xfs_qm_quotacheck(
                 * We must turn off quotas.
                 */
                ASSERT(mp->m_quotainfo != NULL);
-               ASSERT(xfs_Gqm != NULL);
                xfs_qm_destroy_quotainfo(mp);
                if (xfs_mount_reset_sbqflags(mp)) {
                        xfs_warn(mp,
@@ -1604,16 +1411,12 @@ xfs_qm_dqfree_one(
        struct xfs_mount        *mp = dqp->q_mount;
        struct xfs_quotainfo    *qi = mp->m_quotainfo;
 
-       mutex_lock(&dqp->q_hash->qh_lock);
-       list_del_init(&dqp->q_hashlist);
-       dqp->q_hash->qh_version++;
-       mutex_unlock(&dqp->q_hash->qh_lock);
+       mutex_lock(&qi->qi_tree_lock);
+       radix_tree_delete(XFS_DQUOT_TREE(qi, dqp->q_core.d_flags),
+                         be32_to_cpu(dqp->q_core.d_id));
 
-       mutex_lock(&qi->qi_dqlist_lock);
-       list_del_init(&dqp->q_mplist);
        qi->qi_dquots--;
-       qi->qi_dqreclaims++;
-       mutex_unlock(&qi->qi_dqlist_lock);
+       mutex_unlock(&qi->qi_tree_lock);
 
        xfs_qm_dqdestroy(dqp);
 }
@@ -1624,6 +1427,7 @@ xfs_qm_dqreclaim_one(
        struct list_head        *dispose_list)
 {
        struct xfs_mount        *mp = dqp->q_mount;
+       struct xfs_quotainfo    *qi = mp->m_quotainfo;
        int                     error;
 
        if (!xfs_dqlock_nowait(dqp))
@@ -1637,16 +1441,14 @@ xfs_qm_dqreclaim_one(
                xfs_dqunlock(dqp);
 
                trace_xfs_dqreclaim_want(dqp);
-               XQM_STATS_INC(xqmstats.xs_qm_dqwants);
+               XFS_STATS_INC(xs_qm_dqwants);
 
-               list_del_init(&dqp->q_freelist);
-               xfs_Gqm->qm_dqfrlist_cnt--;
+               list_del_init(&dqp->q_lru);
+               qi->qi_lru_count--;
+               XFS_STATS_DEC(xs_qm_dquot_unused);
                return;
        }
 
-       ASSERT(dqp->q_hash);
-       ASSERT(!list_empty(&dqp->q_mplist));
-
        /*
         * Try to grab the flush lock. If this dquot is in the process of
         * getting flushed to disk, we don't want to reclaim it.
@@ -1688,11 +1490,12 @@ xfs_qm_dqreclaim_one(
        xfs_dqunlock(dqp);
 
        ASSERT(dqp->q_nrefs == 0);
-       list_move_tail(&dqp->q_freelist, dispose_list);
-       xfs_Gqm->qm_dqfrlist_cnt--;
+       list_move_tail(&dqp->q_lru, dispose_list);
+       qi->qi_lru_count--;
+       XFS_STATS_DEC(xs_qm_dquot_unused);
 
        trace_xfs_dqreclaim_done(dqp);
-       XQM_STATS_INC(xqmstats.xs_qm_dqreclaims);
+       XFS_STATS_INC(xs_qm_dqreclaims);
        return;
 
 out_busy:
@@ -1701,10 +1504,10 @@ out_busy:
        /*
         * Move the dquot to the tail of the list so that we don't spin on it.
         */
-       list_move_tail(&dqp->q_freelist, &xfs_Gqm->qm_dqfrlist);
+       list_move_tail(&dqp->q_lru, &qi->qi_lru_list);
 
        trace_xfs_dqreclaim_busy(dqp);
-       XQM_STATS_INC(xqmstats.xs_qm_dqreclaim_misses);
+       XFS_STATS_INC(xs_qm_dqreclaim_misses);
 }
 
 STATIC int
@@ -1712,6 +1515,8 @@ xfs_qm_shake(
        struct shrinker         *shrink,
        struct shrink_control   *sc)
 {
+       struct xfs_quotainfo    *qi =
+               container_of(shrink, struct xfs_quotainfo, qi_shrinker);
        int                     nr_to_scan = sc->nr_to_scan;
        LIST_HEAD               (dispose_list);
        struct xfs_dquot        *dqp;
@@ -1721,24 +1526,23 @@ xfs_qm_shake(
        if (!nr_to_scan)
                goto out;
 
-       mutex_lock(&xfs_Gqm->qm_dqfrlist_lock);
-       while (!list_empty(&xfs_Gqm->qm_dqfrlist)) {
+       mutex_lock(&qi->qi_lru_lock);
+       while (!list_empty(&qi->qi_lru_list)) {
                if (nr_to_scan-- <= 0)
                        break;
-               dqp = list_first_entry(&xfs_Gqm->qm_dqfrlist, struct xfs_dquot,
-                                      q_freelist);
+               dqp = list_first_entry(&qi->qi_lru_list, struct xfs_dquot,
+                                      q_lru);
                xfs_qm_dqreclaim_one(dqp, &dispose_list);
        }
-       mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
+       mutex_unlock(&qi->qi_lru_lock);
 
        while (!list_empty(&dispose_list)) {
-               dqp = list_first_entry(&dispose_list, struct xfs_dquot,
-                                      q_freelist);
-               list_del_init(&dqp->q_freelist);
+               dqp = list_first_entry(&dispose_list, struct xfs_dquot, q_lru);
+               list_del_init(&dqp->q_lru);
                xfs_qm_dqfree_one(dqp);
        }
 out:
-       return (xfs_Gqm->qm_dqfrlist_cnt / 100) * sysctl_vfs_cache_pressure;
+       return (qi->qi_lru_count / 100) * sysctl_vfs_cache_pressure;
 }
 
 /*
index 9a9b997e1a0a294bd6f180ccaac011f2ec49a517..44b858b79d716709a155976f01183fa056f25fcf 100644 (file)
 #include "xfs_dquot_item.h"
 #include "xfs_dquot.h"
 #include "xfs_quota_priv.h"
-#include "xfs_qm_stats.h"
 
-struct xfs_qm;
 struct xfs_inode;
 
-extern struct mutex    xfs_Gqm_lock;
-extern struct xfs_qm   *xfs_Gqm;
-extern kmem_zone_t     *qm_dqzone;
-extern kmem_zone_t     *qm_dqtrxzone;
-
-/*
- * Dquot hashtable constants/threshold values.
- */
-#define XFS_QM_HASHSIZE_LOW            (PAGE_SIZE / sizeof(xfs_dqhash_t))
-#define XFS_QM_HASHSIZE_HIGH           ((PAGE_SIZE * 4) / sizeof(xfs_dqhash_t))
+extern struct kmem_zone        *xfs_qm_dqtrxzone;
 
 /*
  * This defines the unit of allocation of dquots.
@@ -48,36 +37,20 @@ extern kmem_zone_t  *qm_dqtrxzone;
  */
 #define XFS_DQUOT_CLUSTER_SIZE_FSB     (xfs_filblks_t)1
 
-typedef xfs_dqhash_t   xfs_dqlist_t;
-
-/*
- * Quota Manager (global) structure. Lives only in core.
- */
-typedef struct xfs_qm {
-       xfs_dqlist_t    *qm_usr_dqhtable;/* udquot hash table */
-       xfs_dqlist_t    *qm_grp_dqhtable;/* gdquot hash table */
-       uint             qm_dqhashmask;  /* # buckets in dq hashtab - 1 */
-       struct list_head qm_dqfrlist;    /* freelist of dquots */
-       struct mutex     qm_dqfrlist_lock;
-       int              qm_dqfrlist_cnt;
-       atomic_t         qm_totaldquots; /* total incore dquots */
-       uint             qm_nrefs;       /* file systems with quota on */
-       kmem_zone_t     *qm_dqzone;      /* dquot mem-alloc zone */
-       kmem_zone_t     *qm_dqtrxzone;   /* t_dqinfo of transactions */
-} xfs_qm_t;
-
 /*
  * Various quota information for individual filesystems.
  * The mount structure keeps a pointer to this.
  */
 typedef struct xfs_quotainfo {
+       struct radix_tree_root qi_uquota_tree;
+       struct radix_tree_root qi_gquota_tree;
+       struct mutex qi_tree_lock;
        xfs_inode_t     *qi_uquotaip;    /* user quota inode */
        xfs_inode_t     *qi_gquotaip;    /* group quota inode */
-       struct list_head qi_dqlist;      /* all dquots in filesys */
-       struct mutex     qi_dqlist_lock;
+       struct list_head qi_lru_list;
+       struct mutex     qi_lru_lock;
+       int              qi_lru_count;
        int              qi_dquots;
-       int              qi_dqreclaims;  /* a change here indicates
-                                           a removal in the dqlist */
        time_t           qi_btimelimit;  /* limit for blks timer */
        time_t           qi_itimelimit;  /* limit for inodes timer */
        time_t           qi_rtbtimelimit;/* limit for rt blks timer */
@@ -93,8 +66,14 @@ typedef struct xfs_quotainfo {
        xfs_qcnt_t       qi_isoftlimit;  /* default inode count soft limit */
        xfs_qcnt_t       qi_rtbhardlimit;/* default realtime blk hard limit */
        xfs_qcnt_t       qi_rtbsoftlimit;/* default realtime blk soft limit */
+       struct shrinker  qi_shrinker;
 } xfs_quotainfo_t;
 
+#define XFS_DQUOT_TREE(qi, type) \
+       (((type) & XFS_DQ_USER) ? /* parenthesized: safe for expression args */ \
+        &((qi)->qi_uquota_tree) : /* XFS_DQ_USER bit set */ \
+        &((qi)->qi_gquota_tree))  /* any other type */
+
 
 extern void    xfs_trans_mod_dquot(xfs_trans_t *, xfs_dquot_t *, uint, long);
 extern int     xfs_trans_reserve_quota_bydquots(xfs_trans_t *, xfs_mount_t *,
@@ -130,7 +109,7 @@ extern int          xfs_qm_quotacheck(xfs_mount_t *);
 extern int             xfs_qm_write_sb_changes(xfs_mount_t *, __int64_t);
 
 /* dquot stuff */
-extern int             xfs_qm_dqpurge_all(xfs_mount_t *, uint);
+extern void            xfs_qm_dqpurge_all(xfs_mount_t *, uint);
 extern void            xfs_qm_dqrele_all_inodes(xfs_mount_t *, uint);
 
 /* quota ops */
index a0a829addca9d3c79c649201cf3f9d76432847c1..e6986b5d80d8fca7d9add0e7277c6f3a61859cdd 100644 (file)
 STATIC void
 xfs_fill_statvfs_from_dquot(
        struct kstatfs          *statp,
-       xfs_disk_dquot_t        *dp)
+       struct xfs_dquot        *dqp)
 {
        __uint64_t              limit;
 
-       limit = dp->d_blk_softlimit ?
-               be64_to_cpu(dp->d_blk_softlimit) :
-               be64_to_cpu(dp->d_blk_hardlimit);
+       limit = dqp->q_core.d_blk_softlimit ?
+               be64_to_cpu(dqp->q_core.d_blk_softlimit) :
+               be64_to_cpu(dqp->q_core.d_blk_hardlimit);
        if (limit && statp->f_blocks > limit) {
                statp->f_blocks = limit;
                statp->f_bfree = statp->f_bavail =
-                       (statp->f_blocks > be64_to_cpu(dp->d_bcount)) ?
-                        (statp->f_blocks - be64_to_cpu(dp->d_bcount)) : 0;
+                       (statp->f_blocks > dqp->q_res_bcount) ?
+                        (statp->f_blocks - dqp->q_res_bcount) : 0;
        }
 
-       limit = dp->d_ino_softlimit ?
-               be64_to_cpu(dp->d_ino_softlimit) :
-               be64_to_cpu(dp->d_ino_hardlimit);
+       limit = dqp->q_core.d_ino_softlimit ?
+               be64_to_cpu(dqp->q_core.d_ino_softlimit) :
+               be64_to_cpu(dqp->q_core.d_ino_hardlimit);
        if (limit && statp->f_files > limit) {
                statp->f_files = limit;
                statp->f_ffree =
-                       (statp->f_files > be64_to_cpu(dp->d_icount)) ?
-                        (statp->f_ffree - be64_to_cpu(dp->d_icount)) : 0;
+                       (statp->f_files > dqp->q_res_icount) ?
+                        (statp->f_ffree - dqp->q_res_icount) : 0;
        }
 }
 
@@ -82,7 +82,7 @@ xfs_qm_statvfs(
        xfs_dquot_t             *dqp;
 
        if (!xfs_qm_dqget(mp, NULL, xfs_get_projid(ip), XFS_DQ_PROJ, 0, &dqp)) {
-               xfs_fill_statvfs_from_dquot(statp, &dqp->q_core);
+               xfs_fill_statvfs_from_dquot(statp, dqp);
                xfs_qm_dqput(dqp);
        }
 }
@@ -156,21 +156,3 @@ xfs_qm_newmount(
 
        return 0;
 }
-
-void __init
-xfs_qm_init(void)
-{
-       printk(KERN_INFO "SGI XFS Quota Management subsystem\n");
-       mutex_init(&xfs_Gqm_lock);
-       xfs_qm_init_procfs();
-}
-
-void __exit
-xfs_qm_exit(void)
-{
-       xfs_qm_cleanup_procfs();
-       if (qm_dqzone)
-               kmem_zone_destroy(qm_dqzone);
-       if (qm_dqtrxzone)
-               kmem_zone_destroy(qm_dqtrxzone);
-}
diff --git a/fs/xfs/xfs_qm_stats.c b/fs/xfs/xfs_qm_stats.c
deleted file mode 100644 (file)
index 5729ba5..0000000
+++ /dev/null
@@ -1,105 +0,0 @@
-/*
- * Copyright (c) 2000-2003 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_inum.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_alloc.h"
-#include "xfs_quota.h"
-#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_inode.h"
-#include "xfs_itable.h"
-#include "xfs_bmap.h"
-#include "xfs_rtalloc.h"
-#include "xfs_error.h"
-#include "xfs_attr.h"
-#include "xfs_buf_item.h"
-#include "xfs_qm.h"
-
-struct xqmstats xqmstats;
-
-static int xqm_proc_show(struct seq_file *m, void *v)
-{
-       /* maximum; incore; ratio free to inuse; freelist */
-       seq_printf(m, "%d\t%d\t%d\t%u\n",
-                       0,
-                       xfs_Gqm? atomic_read(&xfs_Gqm->qm_totaldquots) : 0,
-                       0,
-                       xfs_Gqm? xfs_Gqm->qm_dqfrlist_cnt : 0);
-       return 0;
-}
-
-static int xqm_proc_open(struct inode *inode, struct file *file)
-{
-       return single_open(file, xqm_proc_show, NULL);
-}
-
-static const struct file_operations xqm_proc_fops = {
-       .owner          = THIS_MODULE,
-       .open           = xqm_proc_open,
-       .read           = seq_read,
-       .llseek         = seq_lseek,
-       .release        = single_release,
-};
-
-static int xqmstat_proc_show(struct seq_file *m, void *v)
-{
-       /* quota performance statistics */
-       seq_printf(m, "qm %u %u %u %u %u %u %u %u\n",
-                       xqmstats.xs_qm_dqreclaims,
-                       xqmstats.xs_qm_dqreclaim_misses,
-                       xqmstats.xs_qm_dquot_dups,
-                       xqmstats.xs_qm_dqcachemisses,
-                       xqmstats.xs_qm_dqcachehits,
-                       xqmstats.xs_qm_dqwants,
-                       xqmstats.xs_qm_dqshake_reclaims,
-                       xqmstats.xs_qm_dqinact_reclaims);
-       return 0;
-}
-
-static int xqmstat_proc_open(struct inode *inode, struct file *file)
-{
-       return single_open(file, xqmstat_proc_show, NULL);
-}
-
-static const struct file_operations xqmstat_proc_fops = {
-       .owner          = THIS_MODULE,
-       .open           = xqmstat_proc_open,
-       .read           = seq_read,
-       .llseek         = seq_lseek,
-       .release        = single_release,
-};
-
-void
-xfs_qm_init_procfs(void)
-{
-       proc_create("fs/xfs/xqmstat", 0, NULL, &xqmstat_proc_fops);
-       proc_create("fs/xfs/xqm", 0, NULL, &xqm_proc_fops);
-}
-
-void
-xfs_qm_cleanup_procfs(void)
-{
-       remove_proc_entry("fs/xfs/xqm", NULL);
-       remove_proc_entry("fs/xfs/xqmstat", NULL);
-}
diff --git a/fs/xfs/xfs_qm_stats.h b/fs/xfs/xfs_qm_stats.h
deleted file mode 100644 (file)
index 5b964fc..0000000
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Copyright (c) 2002 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_QM_STATS_H__
-#define __XFS_QM_STATS_H__
-
-#if defined(CONFIG_PROC_FS) && !defined(XFS_STATS_OFF)
-
-/*
- * XQM global statistics
- */
-struct xqmstats {
-       __uint32_t              xs_qm_dqreclaims;
-       __uint32_t              xs_qm_dqreclaim_misses;
-       __uint32_t              xs_qm_dquot_dups;
-       __uint32_t              xs_qm_dqcachemisses;
-       __uint32_t              xs_qm_dqcachehits;
-       __uint32_t              xs_qm_dqwants;
-       __uint32_t              xs_qm_dqshake_reclaims;
-       __uint32_t              xs_qm_dqinact_reclaims;
-};
-
-extern struct xqmstats xqmstats;
-
-# define XQM_STATS_INC(count)  ( (count)++ )
-
-extern void xfs_qm_init_procfs(void);
-extern void xfs_qm_cleanup_procfs(void);
-
-#else
-
-# define XQM_STATS_INC(count)  do { } while (0)
-
-static inline void xfs_qm_init_procfs(void) { };
-static inline void xfs_qm_cleanup_procfs(void) { };
-
-#endif
-
-#endif /* __XFS_QM_STATS_H__ */
index 711a86e39ff046d302a5ff14695882b65bcb3e25..c4f396e437a87656599d21a25186f426556fbcae 100644 (file)
@@ -47,9 +47,6 @@ STATIC int    xfs_qm_log_quotaoff_end(xfs_mount_t *, xfs_qoff_logitem_t *,
                                        uint);
 STATIC uint    xfs_qm_export_flags(uint);
 STATIC uint    xfs_qm_export_qtype_flags(uint);
-STATIC void    xfs_qm_export_dquot(xfs_mount_t *, xfs_disk_dquot_t *,
-                                       fs_disk_quota_t *);
-
 
 /*
  * Turn off quota accounting and/or enforcement for all udquots and/or
@@ -69,7 +66,6 @@ xfs_qm_scall_quotaoff(
        int                     error;
        uint                    inactivate_flags;
        xfs_qoff_logitem_t      *qoffstart;
-       int                     nculprits;
 
        /*
         * No file system can have quotas enabled on disk but not in core.
@@ -175,18 +171,13 @@ xfs_qm_scall_quotaoff(
         * This isn't protected by a particular lock directly, because we
         * don't want to take a mrlock every time we depend on quotas being on.
         */
-       mp->m_qflags &= ~(flags);
+       mp->m_qflags &= ~flags;
 
        /*
         * Go through all the dquots of this file system and purge them,
-        * according to what was turned off. We may not be able to get rid
-        * of all dquots, because dquots can have temporary references that
-        * are not attached to inodes. eg. xfs_setattr, xfs_create.
-        * So, if we couldn't purge all the dquots from the filesystem,
-        * we can't get rid of the incore data structures.
+        * according to what was turned off.
         */
-       while ((nculprits = xfs_qm_dqpurge_all(mp, dqtype)))
-               delay(10 * nculprits);
+       xfs_qm_dqpurge_all(mp, dqtype);
 
        /*
         * Transactions that had started before ACTIVE state bit was cleared
@@ -635,42 +626,6 @@ xfs_qm_scall_setqlim(
        return error;
 }
 
-int
-xfs_qm_scall_getquota(
-       xfs_mount_t     *mp,
-       xfs_dqid_t      id,
-       uint            type,
-       fs_disk_quota_t *out)
-{
-       xfs_dquot_t     *dqp;
-       int             error;
-
-       /*
-        * Try to get the dquot. We don't want it allocated on disk, so
-        * we aren't passing the XFS_QMOPT_DOALLOC flag. If it doesn't
-        * exist, we'll get ENOENT back.
-        */
-       if ((error = xfs_qm_dqget(mp, NULL, id, type, 0, &dqp))) {
-               return (error);
-       }
-
-       /*
-        * If everything's NULL, this dquot doesn't quite exist as far as
-        * our utility programs are concerned.
-        */
-       if (XFS_IS_DQUOT_UNINITIALIZED(dqp)) {
-               xfs_qm_dqput(dqp);
-               return XFS_ERROR(ENOENT);
-       }
-       /*
-        * Convert the disk dquot to the exportable format
-        */
-       xfs_qm_export_dquot(mp, &dqp->q_core, out);
-       xfs_qm_dqput(dqp);
-       return (error ? XFS_ERROR(EFAULT) : 0);
-}
-
-
 STATIC int
 xfs_qm_log_quotaoff_end(
        xfs_mount_t             *mp,
@@ -759,50 +714,66 @@ error0:
 }
 
 
-/*
- * Translate an internal style on-disk-dquot to the exportable format.
- * The main differences are that the counters/limits are all in Basic
- * Blocks (BBs) instead of the internal FSBs, and all on-disk data has
- * to be converted to the native endianness.
- */
-STATIC void
-xfs_qm_export_dquot(
-       xfs_mount_t             *mp,
-       xfs_disk_dquot_t        *src,
+int
+xfs_qm_scall_getquota(
+       struct xfs_mount        *mp,
+       xfs_dqid_t              id,
+       uint                    type,
        struct fs_disk_quota    *dst)
 {
+       struct xfs_dquot        *dqp;
+       int                     error;
+
+       /*
+        * Try to get the dquot. We don't want it allocated on disk, so
+        * we aren't passing the XFS_QMOPT_DOALLOC flag. If it doesn't
+        * exist, we'll get ENOENT back.
+        */
+       error = xfs_qm_dqget(mp, NULL, id, type, 0, &dqp);
+       if (error)
+               return error;
+
+       /*
+        * If everything's NULL, this dquot doesn't quite exist as far as
+        * our utility programs are concerned.
+        */
+       if (XFS_IS_DQUOT_UNINITIALIZED(dqp)) {
+               error = XFS_ERROR(ENOENT);
+               goto out_put;
+       }
+
        memset(dst, 0, sizeof(*dst));
-       dst->d_version = FS_DQUOT_VERSION;  /* different from src->d_version */
-       dst->d_flags = xfs_qm_export_qtype_flags(src->d_flags);
-       dst->d_id = be32_to_cpu(src->d_id);
+       dst->d_version = FS_DQUOT_VERSION;
+       dst->d_flags = xfs_qm_export_qtype_flags(dqp->q_core.d_flags);
+       dst->d_id = be32_to_cpu(dqp->q_core.d_id);
        dst->d_blk_hardlimit =
-               XFS_FSB_TO_BB(mp, be64_to_cpu(src->d_blk_hardlimit));
+               XFS_FSB_TO_BB(mp, be64_to_cpu(dqp->q_core.d_blk_hardlimit));
        dst->d_blk_softlimit =
-               XFS_FSB_TO_BB(mp, be64_to_cpu(src->d_blk_softlimit));
-       dst->d_ino_hardlimit = be64_to_cpu(src->d_ino_hardlimit);
-       dst->d_ino_softlimit = be64_to_cpu(src->d_ino_softlimit);
-       dst->d_bcount = XFS_FSB_TO_BB(mp, be64_to_cpu(src->d_bcount));
-       dst->d_icount = be64_to_cpu(src->d_icount);
-       dst->d_btimer = be32_to_cpu(src->d_btimer);
-       dst->d_itimer = be32_to_cpu(src->d_itimer);
-       dst->d_iwarns = be16_to_cpu(src->d_iwarns);
-       dst->d_bwarns = be16_to_cpu(src->d_bwarns);
+               XFS_FSB_TO_BB(mp, be64_to_cpu(dqp->q_core.d_blk_softlimit));
+       dst->d_ino_hardlimit = be64_to_cpu(dqp->q_core.d_ino_hardlimit);
+       dst->d_ino_softlimit = be64_to_cpu(dqp->q_core.d_ino_softlimit);
+       dst->d_bcount = XFS_FSB_TO_BB(mp, dqp->q_res_bcount);
+       dst->d_icount = dqp->q_res_icount;
+       dst->d_btimer = be32_to_cpu(dqp->q_core.d_btimer);
+       dst->d_itimer = be32_to_cpu(dqp->q_core.d_itimer);
+       dst->d_iwarns = be16_to_cpu(dqp->q_core.d_iwarns);
+       dst->d_bwarns = be16_to_cpu(dqp->q_core.d_bwarns);
        dst->d_rtb_hardlimit =
-               XFS_FSB_TO_BB(mp, be64_to_cpu(src->d_rtb_hardlimit));
+               XFS_FSB_TO_BB(mp, be64_to_cpu(dqp->q_core.d_rtb_hardlimit));
        dst->d_rtb_softlimit =
-               XFS_FSB_TO_BB(mp, be64_to_cpu(src->d_rtb_softlimit));
-       dst->d_rtbcount = XFS_FSB_TO_BB(mp, be64_to_cpu(src->d_rtbcount));
-       dst->d_rtbtimer = be32_to_cpu(src->d_rtbtimer);
-       dst->d_rtbwarns = be16_to_cpu(src->d_rtbwarns);
+               XFS_FSB_TO_BB(mp, be64_to_cpu(dqp->q_core.d_rtb_softlimit));
+       dst->d_rtbcount = XFS_FSB_TO_BB(mp, dqp->q_res_rtbcount);
+       dst->d_rtbtimer = be32_to_cpu(dqp->q_core.d_rtbtimer);
+       dst->d_rtbwarns = be16_to_cpu(dqp->q_core.d_rtbwarns);
 
        /*
         * Internally, we don't reset all the timers when quota enforcement
         * gets turned off. No need to confuse the user level code,
         * so return zeroes in that case.
         */
-       if ((!XFS_IS_UQUOTA_ENFORCED(mp) && src->d_flags == XFS_DQ_USER) ||
+       if ((!XFS_IS_UQUOTA_ENFORCED(mp) && dqp->q_core.d_flags == XFS_DQ_USER) ||
            (!XFS_IS_OQUOTA_ENFORCED(mp) &&
-                       (src->d_flags & (XFS_DQ_PROJ | XFS_DQ_GROUP)))) {
+                       (dqp->q_core.d_flags & (XFS_DQ_PROJ | XFS_DQ_GROUP)))) {
                dst->d_btimer = 0;
                dst->d_itimer = 0;
                dst->d_rtbtimer = 0;
@@ -823,6 +794,9 @@ xfs_qm_export_dquot(
                }
        }
 #endif
+out_put:
+       xfs_qm_dqput(dqp);
+       return error;
 }
 
 STATIC uint
index 8a0807e0f979eff06d930c39cdd11b3e03b2fcc4..b50ec5b95d5a89fb4b0972c1761682c27fe8cdd9 100644 (file)
@@ -174,6 +174,8 @@ typedef struct xfs_qoff_logformat {
 #define XFS_UQUOTA_ACTIVE      0x0100  /* uquotas are being turned off */
 #define XFS_PQUOTA_ACTIVE      0x0200  /* pquotas are being turned off */
 #define XFS_GQUOTA_ACTIVE      0x0400  /* gquotas are being turned off */
+#define XFS_ALL_QUOTA_ACTIVE   \
+       (XFS_UQUOTA_ACTIVE | XFS_PQUOTA_ACTIVE | XFS_GQUOTA_ACTIVE)
 
 /*
  * Checking XFS_IS_*QUOTA_ON() while holding any inode lock guarantees
index 94a3d927d716c6ac6075323340affe7110f7d81a..6d86219d93da2c55b98eb26c190b2ec478e31bcd 100644 (file)
  */
 #define XFS_DQITER_MAP_SIZE    10
 
-/*
- * Hash into a bucket in the dquot hash table, based on <mp, id>.
- */
-#define XFS_DQ_HASHVAL(mp, id) (((__psunsigned_t)(mp) + \
-                                (__psunsigned_t)(id)) & \
-                               (xfs_Gqm->qm_dqhashmask - 1))
-#define XFS_DQ_HASH(mp, id, type)   (type == XFS_DQ_USER ? \
-                                    (xfs_Gqm->qm_usr_dqhtable + \
-                                     XFS_DQ_HASHVAL(mp, id)) : \
-                                    (xfs_Gqm->qm_grp_dqhtable + \
-                                     XFS_DQ_HASHVAL(mp, id)))
 #define XFS_IS_DQUOT_UNINITIALIZED(dqp) ( \
        !dqp->q_core.d_blk_hardlimit && \
        !dqp->q_core.d_blk_softlimit && \
index cb6ae715814a8f87026b26f701abe419aa3ed6c7..f429d9d5d325d8f1e48f13efda8b1b9a2d4b6f7e 100644 (file)
@@ -529,7 +529,6 @@ static inline int xfs_sb_version_hasprojid32bit(xfs_sb_t *sbp)
 #define        XFS_BB_TO_FSB(mp,bb)    \
        (((bb) + (XFS_FSB_TO_BB(mp,1) - 1)) >> (mp)->m_blkbb_log)
 #define        XFS_BB_TO_FSBT(mp,bb)   ((bb) >> (mp)->m_blkbb_log)
-#define        XFS_BB_FSB_OFFSET(mp,bb) ((bb) & ((mp)->m_bsize - 1))
 
 /*
  * File system block to byte conversions.
index 76fdc5861932f8ed5412b4345dbb59a1054f8f9c..ce372b7d5644600ec6e52a23f8ec00584e1f2e9c 100644 (file)
 
 DEFINE_PER_CPU(struct xfsstats, xfsstats);
 
+static int counter_val(int idx)
+{
+       int total = 0;
+       int cpu;
+
+       /* add up slot idx of every possible CPU's xfsstats, viewed as __u32[] */
+       for_each_possible_cpu(cpu) {
+               __u32 *slots = (__u32 *)&per_cpu(xfsstats, cpu);
+
+               total += slots[idx];
+       }
+       return total;
+}
+
 static int xfs_stat_proc_show(struct seq_file *m, void *v)
 {
-       int             c, i, j, val;
+       int             i, j;
        __uint64_t      xs_xstrat_bytes = 0;
        __uint64_t      xs_write_bytes = 0;
        __uint64_t      xs_read_bytes = 0;
@@ -50,20 +59,16 @@ static int xfs_stat_proc_show(struct seq_file *m, void *v)
                { "abtc2",              XFSSTAT_END_ABTC_V2             },
                { "bmbt2",              XFSSTAT_END_BMBT_V2             },
                { "ibt2",               XFSSTAT_END_IBT_V2              },
+               /* we print both series of quota information together */
+               { "qm",                 XFSSTAT_END_QM                  },
        };
 
        /* Loop over all stats groups */
-       for (i=j = 0; i < ARRAY_SIZE(xstats); i++) {
+       for (i = j = 0; i < ARRAY_SIZE(xstats); i++) {
                seq_printf(m, "%s", xstats[i].desc);
                /* inner loop does each group */
-               while (j < xstats[i].endpoint) {
-                       val = 0;
-                       /* sum over all cpus */
-                       for_each_possible_cpu(c)
-                               val += *(((__u32*)&per_cpu(xfsstats, c) + j));
-                       seq_printf(m, " %u", val);
-                       j++;
-               }
+               for (; j < xstats[i].endpoint; j++)
+                       seq_printf(m, " %u", counter_val(j));
                seq_putc(m, '\n');
        }
        /* extra precision counters */
@@ -97,6 +102,58 @@ static const struct file_operations xfs_stat_proc_fops = {
        .release        = single_release,
 };
 
+/* legacy quota interfaces */
+#ifdef CONFIG_XFS_QUOTA
+static int xqm_proc_show(struct seq_file *m, void *v)
+{
+       /* maximum; incore; ratio free to inuse; freelist */
+       seq_printf(m, "%d\t%d\t%d\t%u\n",
+                       0,
+                       counter_val(XFSSTAT_END_XQMSTAT),
+                       0,
+                       counter_val(XFSSTAT_END_XQMSTAT + 1));
+       return 0;
+}
+
+static int xqm_proc_open(struct inode *inode, struct file *file)
+{
+       return single_open(file, xqm_proc_show, NULL);
+}
+
+static const struct file_operations xqm_proc_fops = {
+       .owner          = THIS_MODULE,
+       .open           = xqm_proc_open,
+       .read           = seq_read,
+       .llseek         = seq_lseek,
+       .release        = single_release,
+};
+
+/* legacy quota stats interface no 2 */
+static int xqmstat_proc_show(struct seq_file *m, void *v)
+{
+       int j;
+
+       seq_printf(m, "qm");
+       for (j = XFSSTAT_END_IBT_V2; j < XFSSTAT_END_XQMSTAT; j++)
+               seq_printf(m, " %u", counter_val(j));
+       seq_putc(m, '\n');
+       return 0;
+}
+
+static int xqmstat_proc_open(struct inode *inode, struct file *file)
+{
+       return single_open(file, xqmstat_proc_show, NULL);
+}
+
+static const struct file_operations xqmstat_proc_fops = {
+       .owner          = THIS_MODULE,
+       .open           = xqmstat_proc_open,
+       .read           = seq_read,
+       .llseek         = seq_lseek,
+       .release        = single_release,
+};
+#endif /* CONFIG_XFS_QUOTA */
+
 int
 xfs_init_procfs(void)
 {
@@ -105,10 +162,24 @@ xfs_init_procfs(void)
 
        if (!proc_create("fs/xfs/stat", 0, NULL,
                         &xfs_stat_proc_fops))
-               goto out_remove_entry;
+               goto out_remove_xfs_dir;
+#ifdef CONFIG_XFS_QUOTA
+       if (!proc_create("fs/xfs/xqmstat", 0, NULL,
+                        &xqmstat_proc_fops))
+               goto out_remove_stat_file;
+       if (!proc_create("fs/xfs/xqm", 0, NULL,
+                        &xqm_proc_fops))
+               goto out_remove_xqmstat_file;
+#endif
        return 0;
 
- out_remove_entry:
+#ifdef CONFIG_XFS_QUOTA
+ out_remove_xqmstat_file:
+       remove_proc_entry("fs/xfs/xqmstat", NULL);
+ out_remove_stat_file:
+       remove_proc_entry("fs/xfs/stat", NULL);
+#endif
+ out_remove_xfs_dir:
        remove_proc_entry("fs/xfs", NULL);
  out:
        return -ENOMEM;
@@ -117,6 +188,10 @@ xfs_init_procfs(void)
 void
 xfs_cleanup_procfs(void)
 {
+#ifdef CONFIG_XFS_QUOTA
+       remove_proc_entry("fs/xfs/xqm", NULL);
+       remove_proc_entry("fs/xfs/xqmstat", NULL);
+#endif
        remove_proc_entry("fs/xfs/stat", NULL);
        remove_proc_entry("fs/xfs", NULL);
 }
index 736854b1ca1a0c6d1d92488f6cc79891241b8a6d..c03ad38ceaebaeb8b82a2e0fae785cc592e9adaf 100644 (file)
@@ -183,6 +183,16 @@ struct xfsstats {
        __uint32_t              xs_ibt_2_alloc;
        __uint32_t              xs_ibt_2_free;
        __uint32_t              xs_ibt_2_moves;
+#define XFSSTAT_END_XQMSTAT            (XFSSTAT_END_IBT_V2+6)
+       __uint32_t              xs_qm_dqreclaims;
+       __uint32_t              xs_qm_dqreclaim_misses;
+       __uint32_t              xs_qm_dquot_dups;
+       __uint32_t              xs_qm_dqcachemisses;
+       __uint32_t              xs_qm_dqcachehits;
+       __uint32_t              xs_qm_dqwants;
+#define XFSSTAT_END_QM                 (XFSSTAT_END_XQMSTAT+2)
+       __uint32_t              xs_qm_dquot;
+       __uint32_t              xs_qm_dquot_unused;
 /* Extra precision counters */
        __uint64_t              xs_xstrat_bytes;
        __uint64_t              xs_write_bytes;
index baf40e378d35372e7f8c7f99f62b9a02767b3153..912442cf0f82c3a285fa7d49cab22baa03553f53 100644 (file)
@@ -324,10 +324,9 @@ xfs_parseargs(
                } else if (!strcmp(this_char, MNTOPT_FILESTREAM)) {
                        mp->m_flags |= XFS_MOUNT_FILESTREAMS;
                } else if (!strcmp(this_char, MNTOPT_NOQUOTA)) {
-                       mp->m_qflags &= ~(XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE |
-                                         XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE |
-                                         XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE |
-                                         XFS_UQUOTA_ENFD | XFS_OQUOTA_ENFD);
+                       mp->m_qflags &= ~XFS_ALL_QUOTA_ACCT;
+                       mp->m_qflags &= ~XFS_ALL_QUOTA_ENFD;
+                       mp->m_qflags &= ~XFS_ALL_QUOTA_ACTIVE;
                } else if (!strcmp(this_char, MNTOPT_QUOTA) ||
                           !strcmp(this_char, MNTOPT_UQUOTA) ||
                           !strcmp(this_char, MNTOPT_USRQUOTA)) {
@@ -760,6 +759,36 @@ xfs_setup_devices(
        return 0;
 }
 
+STATIC int
+xfs_init_mount_workqueues(
+       struct xfs_mount        *mp)
+{
+       mp->m_data_workqueue = alloc_workqueue("xfs-data/%s",
+                       WQ_MEM_RECLAIM, 0, mp->m_fsname);
+       if (!mp->m_data_workqueue)
+               goto out;
+
+       mp->m_unwritten_workqueue = alloc_workqueue("xfs-conv/%s",
+                       WQ_MEM_RECLAIM, 0, mp->m_fsname);
+       if (!mp->m_unwritten_workqueue)
+               goto out_destroy_data_iodone_queue;
+
+       return 0;
+
+out_destroy_data_iodone_queue:
+       destroy_workqueue(mp->m_data_workqueue);
+out:
+       return -ENOMEM;
+}
+
+STATIC void
+xfs_destroy_mount_workqueues(
+       struct xfs_mount        *mp)
+{
+       destroy_workqueue(mp->m_data_workqueue);
+       destroy_workqueue(mp->m_unwritten_workqueue);
+}
+
 /* Catch misguided souls that try to use this interface on XFS */
 STATIC struct inode *
 xfs_fs_alloc_inode(
@@ -834,91 +863,58 @@ xfs_fs_inode_init_once(
 }
 
 /*
- * Dirty the XFS inode when mark_inode_dirty_sync() is called so that
- * we catch unlogged VFS level updates to the inode.
+ * This is called by the VFS when dirtying inode metadata.  This can happen
+ * for a few reasons, but we only care about timestamp updates, given that
+ * we handled the rest ourselves.  In theory no other calls should happen,
+ * but for example generic_write_end() keeps dirtying the inode after
+ * updating i_size.  Thus we check that the flags are exactly I_DIRTY_SYNC,
+ * and skip this call otherwise.
  *
- * We need the barrier() to maintain correct ordering between unlogged
- * updates and the transaction commit code that clears the i_update_core
- * field. This requires all updates to be completed before marking the
- * inode dirty.
+ * We'll hopefully get a different method just for updating timestamps soon,
+ * at which point this hack can go away, and maybe we'll also get real
+ * error handling here.
  */
 STATIC void
 xfs_fs_dirty_inode(
-       struct inode    *inode,
-       int             flags)
-{
-       barrier();
-       XFS_I(inode)->i_update_core = 1;
-}
-
-STATIC int
-xfs_fs_write_inode(
        struct inode            *inode,
-       struct writeback_control *wbc)
+       int                     flags)
 {
        struct xfs_inode        *ip = XFS_I(inode);
        struct xfs_mount        *mp = ip->i_mount;
-       int                     error = EAGAIN;
-
-       trace_xfs_write_inode(ip);
-
-       if (XFS_FORCED_SHUTDOWN(mp))
-               return -XFS_ERROR(EIO);
-
-       if (wbc->sync_mode == WB_SYNC_ALL || wbc->for_kupdate) {
-               /*
-                * Make sure the inode has made it it into the log.  Instead
-                * of forcing it all the way to stable storage using a
-                * synchronous transaction we let the log force inside the
-                * ->sync_fs call do that for thus, which reduces the number
-                * of synchronous log forces dramatically.
-                */
-               error = xfs_log_dirty_inode(ip, NULL, 0);
-               if (error)
-                       goto out;
-               return 0;
-       } else {
-               if (!ip->i_update_core)
-                       return 0;
+       struct xfs_trans        *tp;
+       int                     error;
 
-               /*
-                * We make this non-blocking if the inode is contended, return
-                * EAGAIN to indicate to the caller that they did not succeed.
-                * This prevents the flush path from blocking on inodes inside
-                * another operation right now, they get caught later by
-                * xfs_sync.
-                */
-               if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED))
-                       goto out;
+       if (flags != I_DIRTY_SYNC)
+               return;
 
-               if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip))
-                       goto out_unlock;
+       trace_xfs_dirty_inode(ip);
 
-               /*
-                * Now we have the flush lock and the inode is not pinned, we
-                * can check if the inode is really clean as we know that
-                * there are no pending transaction completions, it is not
-                * waiting on the delayed write queue and there is no IO in
-                * progress.
-                */
-               if (xfs_inode_clean(ip)) {
-                       xfs_ifunlock(ip);
-                       error = 0;
-                       goto out_unlock;
-               }
-               error = xfs_iflush(ip, SYNC_TRYLOCK);
+       tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
+       error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0);
+       if (error) {
+               xfs_trans_cancel(tp, 0);
+               goto trouble;
        }
-
- out_unlock:
-       xfs_iunlock(ip, XFS_ILOCK_SHARED);
- out:
+       xfs_ilock(ip, XFS_ILOCK_EXCL);
        /*
-        * if we failed to write out the inode then mark
-        * it dirty again so we'll try again later.
+        * Grab all the latest timestamps from the Linux inode.
         */
+       ip->i_d.di_atime.t_sec = (__int32_t)inode->i_atime.tv_sec;
+       ip->i_d.di_atime.t_nsec = (__int32_t)inode->i_atime.tv_nsec;
+       ip->i_d.di_ctime.t_sec = (__int32_t)inode->i_ctime.tv_sec;
+       ip->i_d.di_ctime.t_nsec = (__int32_t)inode->i_ctime.tv_nsec;
+       ip->i_d.di_mtime.t_sec = (__int32_t)inode->i_mtime.tv_sec;
+       ip->i_d.di_mtime.t_nsec = (__int32_t)inode->i_mtime.tv_nsec;
+
+       xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+       xfs_trans_log_inode(tp, ip, XFS_ILOG_TIMESTAMP);
+       error = xfs_trans_commit(tp, 0);
        if (error)
-               xfs_mark_inode_dirty_sync(ip);
-       return -error;
+               goto trouble;
+       return;
+
+trouble:
+       xfs_warn(mp, "failed to update timestamps for inode 0x%llx", ip->i_ino);
 }
 
 STATIC void
@@ -983,6 +979,7 @@ xfs_fs_put_super(
        xfs_unmountfs(mp);
        xfs_freesb(mp);
        xfs_icsb_destroy_counters(mp);
+       xfs_destroy_mount_workqueues(mp);
        xfs_close_devices(mp);
        xfs_free_fsname(mp);
        kfree(mp);
@@ -1309,10 +1306,14 @@ xfs_fs_fill_super(
        if (error)
                goto out_free_fsname;
 
-       error = xfs_icsb_init_counters(mp);
+       error = xfs_init_mount_workqueues(mp);
        if (error)
                goto out_close_devices;
 
+       error = xfs_icsb_init_counters(mp);
+       if (error)
+               goto out_destroy_workqueues;
+
        error = xfs_readsb(mp, flags);
        if (error)
                goto out_destroy_counters;
@@ -1376,6 +1377,8 @@ xfs_fs_fill_super(
        xfs_freesb(mp);
  out_destroy_counters:
        xfs_icsb_destroy_counters(mp);
+out_destroy_workqueues:
+       xfs_destroy_mount_workqueues(mp);
  out_close_devices:
        xfs_close_devices(mp);
  out_free_fsname:
@@ -1429,7 +1432,6 @@ static const struct super_operations xfs_super_operations = {
        .alloc_inode            = xfs_fs_alloc_inode,
        .destroy_inode          = xfs_fs_destroy_inode,
        .dirty_inode            = xfs_fs_dirty_inode,
-       .write_inode            = xfs_fs_write_inode,
        .evict_inode            = xfs_fs_evict_inode,
        .put_super              = xfs_fs_put_super,
        .sync_fs                = xfs_fs_sync_fs,
@@ -1651,13 +1653,17 @@ init_xfs_fs(void)
        if (error)
                goto out_cleanup_procfs;
 
-       vfs_initquota();
+       error = xfs_qm_init();
+       if (error)
+               goto out_sysctl_unregister;
 
        error = register_filesystem(&xfs_fs_type);
        if (error)
-               goto out_sysctl_unregister;
+               goto out_qm_exit;
        return 0;
 
+ out_qm_exit:
+       xfs_qm_exit();
  out_sysctl_unregister:
        xfs_sysctl_unregister();
  out_cleanup_procfs:
@@ -1679,7 +1685,7 @@ init_xfs_fs(void)
 STATIC void __exit
 exit_xfs_fs(void)
 {
-       vfs_exitquota();
+       xfs_qm_exit();
        unregister_filesystem(&xfs_fs_type);
        xfs_sysctl_unregister();
        xfs_cleanup_procfs();
index 50a3266c999e591dee1728aa02173c862bd73838..09b0c26b2245ebd245c2d8fcf4849105d1e5d400 100644 (file)
 #include <linux/exportfs.h>
 
 #ifdef CONFIG_XFS_QUOTA
-extern void xfs_qm_init(void);
+extern int xfs_qm_init(void);
 extern void xfs_qm_exit(void);
-# define vfs_initquota()       xfs_qm_init()
-# define vfs_exitquota()       xfs_qm_exit()
 #else
-# define vfs_initquota()       do { } while (0)
-# define vfs_exitquota()       do { } while (0)
+# define xfs_qm_init() (0)
+# define xfs_qm_exit() do { } while (0)
 #endif
 
 #ifdef CONFIG_XFS_POSIX_ACL
index 40b75eecd2b4b376253e0e9408e42bc475e63f9b..205ebcb34d9e499f732423bd00c1204bb89f6ef8 100644 (file)
@@ -336,32 +336,6 @@ xfs_sync_fsdata(
        return error;
 }
 
-int
-xfs_log_dirty_inode(
-       struct xfs_inode        *ip,
-       struct xfs_perag        *pag,
-       int                     flags)
-{
-       struct xfs_mount        *mp = ip->i_mount;
-       struct xfs_trans        *tp;
-       int                     error;
-
-       if (!ip->i_update_core)
-               return 0;
-
-       tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
-       error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0);
-       if (error) {
-               xfs_trans_cancel(tp, 0);
-               return error;
-       }
-
-       xfs_ilock(ip, XFS_ILOCK_EXCL);
-       xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
-       xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-       return xfs_trans_commit(tp, 0);
-}
-
 /*
  * When remounting a filesystem read-only or freezing the filesystem, we have
  * two phases to execute. This first phase is syncing the data before we
@@ -385,16 +359,6 @@ xfs_quiesce_data(
 {
        int                     error, error2 = 0;
 
-       /*
-        * Log all pending size and timestamp updates.  The vfs writeback
-        * code is supposed to do this, but due to its overagressive
-        * livelock detection it will skip inodes where appending writes
-        * were written out in the first non-blocking sync phase if their
-        * completion took long enough that it happened after taking the
-        * timestamp for the cut-off in the blocking phase.
-        */
-       xfs_inode_ag_iterator(mp, xfs_log_dirty_inode, 0);
-
        /* force out the log */
        xfs_log_force(mp, XFS_LOG_SYNC);
 
@@ -913,17 +877,15 @@ reclaim:
         * can reference the inodes in the cache without taking references.
         *
         * We make that OK here by ensuring that we wait until the inode is
-        * unlocked after the lookup before we go ahead and free it.  We get
-        * both the ilock and the iolock because the code may need to drop the
-        * ilock one but will still hold the iolock.
+        * unlocked after the lookup before we go ahead and free it.
         */
-       xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+       xfs_ilock(ip, XFS_ILOCK_EXCL);
        xfs_qm_dqdetach(ip);
-       xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+       xfs_iunlock(ip, XFS_ILOCK_EXCL);
 
        xfs_inode_free(ip);
-       return error;
 
+       return error;
 }
 
 /*
index fa965479d788d29da66b0e85bd59123c1fe08c65..941202e7ac6e594e2c423c19bc89248397e39516 100644 (file)
@@ -34,8 +34,6 @@ void xfs_quiesce_attr(struct xfs_mount *mp);
 
 void xfs_flush_inodes(struct xfs_inode *ip);
 
-int xfs_log_dirty_inode(struct xfs_inode *ip, struct xfs_perag *pag, int flags);
-
 int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
 int xfs_reclaim_inodes_count(struct xfs_mount *mp);
 void xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan);
index bb134a819930c72448e37cc62de3fb98ec971835..75eb54af4d581e7f4cc9270f0abb1c72238195ac 100644 (file)
@@ -580,7 +580,7 @@ DEFINE_INODE_EVENT(xfs_ioctl_setattr);
 DEFINE_INODE_EVENT(xfs_dir_fsync);
 DEFINE_INODE_EVENT(xfs_file_fsync);
 DEFINE_INODE_EVENT(xfs_destroy_inode);
-DEFINE_INODE_EVENT(xfs_write_inode);
+DEFINE_INODE_EVENT(xfs_dirty_inode);
 DEFINE_INODE_EVENT(xfs_evict_inode);
 
 DEFINE_INODE_EVENT(xfs_dquot_dqalloc);
@@ -741,10 +741,10 @@ DEFINE_DQUOT_EVENT(xfs_dqalloc);
 DEFINE_DQUOT_EVENT(xfs_dqtobp_read);
 DEFINE_DQUOT_EVENT(xfs_dqread);
 DEFINE_DQUOT_EVENT(xfs_dqread_fail);
-DEFINE_DQUOT_EVENT(xfs_dqlookup_found);
-DEFINE_DQUOT_EVENT(xfs_dqlookup_done);
 DEFINE_DQUOT_EVENT(xfs_dqget_hit);
 DEFINE_DQUOT_EVENT(xfs_dqget_miss);
+DEFINE_DQUOT_EVENT(xfs_dqget_freeing);
+DEFINE_DQUOT_EVENT(xfs_dqget_dup);
 DEFINE_DQUOT_EVENT(xfs_dqput);
 DEFINE_DQUOT_EVENT(xfs_dqput_wait);
 DEFINE_DQUOT_EVENT(xfs_dqput_free);
@@ -782,12 +782,12 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class,
                __entry->curr_res = tic->t_curr_res;
                __entry->unit_res = tic->t_unit_res;
                __entry->flags = tic->t_flags;
-               __entry->reserveq = list_empty(&log->l_reserveq);
-               __entry->writeq = list_empty(&log->l_writeq);
-               xlog_crack_grant_head(&log->l_grant_reserve_head,
+               __entry->reserveq = list_empty(&log->l_reserve_head.waiters);
+               __entry->writeq = list_empty(&log->l_write_head.waiters);
+               xlog_crack_grant_head(&log->l_reserve_head.grant,
                                &__entry->grant_reserve_cycle,
                                &__entry->grant_reserve_bytes);
-               xlog_crack_grant_head(&log->l_grant_write_head,
+               xlog_crack_grant_head(&log->l_write_head.grant,
                                &__entry->grant_write_cycle,
                                &__entry->grant_write_bytes);
                __entry->curr_cycle = log->l_curr_cycle;
@@ -826,20 +826,14 @@ DEFINE_EVENT(xfs_loggrant_class, name, \
        TP_ARGS(log, tic))
 DEFINE_LOGGRANT_EVENT(xfs_log_done_nonperm);
 DEFINE_LOGGRANT_EVENT(xfs_log_done_perm);
-DEFINE_LOGGRANT_EVENT(xfs_log_reserve);
 DEFINE_LOGGRANT_EVENT(xfs_log_umount_write);
-DEFINE_LOGGRANT_EVENT(xfs_log_grant_enter);
-DEFINE_LOGGRANT_EVENT(xfs_log_grant_exit);
-DEFINE_LOGGRANT_EVENT(xfs_log_grant_error);
 DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep);
 DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake);
 DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake_up);
-DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_enter);
-DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_exit);
-DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_error);
-DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep);
-DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake);
-DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake_up);
+DEFINE_LOGGRANT_EVENT(xfs_log_reserve);
+DEFINE_LOGGRANT_EVENT(xfs_log_reserve_exit);
+DEFINE_LOGGRANT_EVENT(xfs_log_regrant);
+DEFINE_LOGGRANT_EVENT(xfs_log_regrant_exit);
 DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_enter);
 DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_exit);
 DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_sub);
index 7adcdf15ae0ce563b32eb37582fbdd79a8fdba15..103b00c90004940e40c8b62fe83af2c9f3e93443 100644 (file)
@@ -681,7 +681,6 @@ xfs_trans_reserve(
        uint            flags,
        uint            logcount)
 {
-       int             log_flags;
        int             error = 0;
        int             rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0;
 
@@ -707,24 +706,32 @@ xfs_trans_reserve(
         * Reserve the log space needed for this transaction.
         */
        if (logspace > 0) {
-               ASSERT((tp->t_log_res == 0) || (tp->t_log_res == logspace));
-               ASSERT((tp->t_log_count == 0) ||
-                       (tp->t_log_count == logcount));
+               bool    permanent = false;
+
+               ASSERT(tp->t_log_res == 0 || tp->t_log_res == logspace);
+               ASSERT(tp->t_log_count == 0 || tp->t_log_count == logcount);
+
                if (flags & XFS_TRANS_PERM_LOG_RES) {
-                       log_flags = XFS_LOG_PERM_RESERV;
                        tp->t_flags |= XFS_TRANS_PERM_LOG_RES;
+                       permanent = true;
                } else {
                        ASSERT(tp->t_ticket == NULL);
                        ASSERT(!(tp->t_flags & XFS_TRANS_PERM_LOG_RES));
-                       log_flags = 0;
                }
 
-               error = xfs_log_reserve(tp->t_mountp, logspace, logcount,
-                                       &tp->t_ticket,
-                                       XFS_TRANSACTION, log_flags, tp->t_type);
-               if (error) {
-                       goto undo_blocks;
+               if (tp->t_ticket != NULL) {
+                       ASSERT(flags & XFS_TRANS_PERM_LOG_RES);
+                       error = xfs_log_regrant(tp->t_mountp, tp->t_ticket);
+               } else {
+                       error = xfs_log_reserve(tp->t_mountp, logspace,
+                                               logcount, &tp->t_ticket,
+                                               XFS_TRANSACTION, permanent,
+                                               tp->t_type);
                }
+
+               if (error)
+                       goto undo_blocks;
+
                tp->t_log_res = logspace;
                tp->t_log_count = logcount;
        }
@@ -752,6 +759,8 @@ xfs_trans_reserve(
         */
 undo_log:
        if (logspace > 0) {
+               int             log_flags;
+
                if (flags & XFS_TRANS_PERM_LOG_RES) {
                        log_flags = XFS_LOG_REL_PERM_RESERV;
                } else {
index ed9252bcdac9c351a79020451cf76eb4287992f7..1dead07f092c92afbea3ceeb22e0cf70430fa1fb 100644 (file)
@@ -610,50 +610,6 @@ xfs_ail_push_all(
                xfs_ail_push(ailp, threshold_lsn);
 }
 
-/*
- * This is to be called when an item is unlocked that may have
- * been in the AIL.  It will wake up the first member of the AIL
- * wait list if this item's unlocking might allow it to progress.
- * If the item is in the AIL, then we need to get the AIL lock
- * while doing our checking so we don't race with someone going
- * to sleep waiting for this event in xfs_trans_push_ail().
- */
-void
-xfs_trans_unlocked_item(
-       struct xfs_ail  *ailp,
-       xfs_log_item_t  *lip)
-{
-       xfs_log_item_t  *min_lip;
-
-       /*
-        * If we're forcibly shutting down, we may have
-        * unlocked log items arbitrarily. The last thing
-        * we want to do is to move the tail of the log
-        * over some potentially valid data.
-        */
-       if (!(lip->li_flags & XFS_LI_IN_AIL) ||
-           XFS_FORCED_SHUTDOWN(ailp->xa_mount)) {
-               return;
-       }
-
-       /*
-        * This is the one case where we can call into xfs_ail_min()
-        * without holding the AIL lock because we only care about the
-        * case where we are at the tail of the AIL.  If the object isn't
-        * at the tail, it doesn't matter what result we get back.  This
-        * is slightly racy because since we were just unlocked, we could
-        * go to sleep between the call to xfs_ail_min and the call to
-        * xfs_log_move_tail, have someone else lock us, commit to us disk,
-        * move us out of the tail of the AIL, and then we wake up.  However,
-        * the call to xfs_log_move_tail() doesn't do anything if there's
-        * not enough free space to wake people up so we're safe calling it.
-        */
-       min_lip = xfs_ail_min(ailp);
-
-       if (min_lip == lip)
-               xfs_log_move_tail(ailp->xa_mount, 1);
-}      /* xfs_trans_unlocked_item */
-
 /*
  * xfs_trans_ail_update - bulk AIL insertion operation.
  *
@@ -685,7 +641,6 @@ xfs_trans_ail_update_bulk(
        xfs_lsn_t               lsn) __releases(ailp->xa_lock)
 {
        xfs_log_item_t          *mlip;
-       xfs_lsn_t               tail_lsn;
        int                     mlip_changed = 0;
        int                     i;
        LIST_HEAD(tmp);
@@ -712,22 +667,12 @@ xfs_trans_ail_update_bulk(
 
        if (!list_empty(&tmp))
                xfs_ail_splice(ailp, cur, &tmp, lsn);
+       spin_unlock(&ailp->xa_lock);
 
-       if (!mlip_changed) {
-               spin_unlock(&ailp->xa_lock);
-               return;
+       if (mlip_changed && !XFS_FORCED_SHUTDOWN(ailp->xa_mount)) {
+               xlog_assign_tail_lsn(ailp->xa_mount);
+               xfs_log_space_wake(ailp->xa_mount);
        }
-
-       /*
-        * It is not safe to access mlip after the AIL lock is dropped, so we
-        * must get a copy of li_lsn before we do so.  This is especially
-        * important on 32-bit platforms where accessing and updating 64-bit
-        * values like li_lsn is not atomic.
-        */
-       mlip = xfs_ail_min(ailp);
-       tail_lsn = mlip->li_lsn;
-       spin_unlock(&ailp->xa_lock);
-       xfs_log_move_tail(ailp->xa_mount, tail_lsn);
 }
 
 /*
@@ -758,7 +703,6 @@ xfs_trans_ail_delete_bulk(
        int                     nr_items) __releases(ailp->xa_lock)
 {
        xfs_log_item_t          *mlip;
-       xfs_lsn_t               tail_lsn;
        int                     mlip_changed = 0;
        int                     i;
 
@@ -785,23 +729,12 @@ xfs_trans_ail_delete_bulk(
                if (mlip == lip)
                        mlip_changed = 1;
        }
+       spin_unlock(&ailp->xa_lock);
 
-       if (!mlip_changed) {
-               spin_unlock(&ailp->xa_lock);
-               return;
+       if (mlip_changed && !XFS_FORCED_SHUTDOWN(ailp->xa_mount)) {
+               xlog_assign_tail_lsn(ailp->xa_mount);
+               xfs_log_space_wake(ailp->xa_mount);
        }
-
-       /*
-        * It is not safe to access mlip after the AIL lock is dropped, so we
-        * must get a copy of li_lsn before we do so.  This is especially
-        * important on 32-bit platforms where accessing and updating 64-bit
-        * values like li_lsn is not atomic. It is possible we've emptied the
-        * AIL here, so if that is the case, pass an LSN of 0 to the tail move.
-        */
-       mlip = xfs_ail_min(ailp);
-       tail_lsn = mlip ? mlip->li_lsn : 0;
-       spin_unlock(&ailp->xa_lock);
-       xfs_log_move_tail(ailp->xa_mount, tail_lsn);
 }
 
 /*
index 475a4ded4f41a875ff0f77ee7990b969cbc3f124..1302d1d95a5850d121af792719b04d2d5672af39 100644 (file)
@@ -463,19 +463,7 @@ xfs_trans_brelse(xfs_trans_t       *tp,
         * Default to a normal brelse() call if the tp is NULL.
         */
        if (tp == NULL) {
-               struct xfs_log_item     *lip = bp->b_fspriv;
-
                ASSERT(bp->b_transp == NULL);
-
-               /*
-                * If there's a buf log item attached to the buffer,
-                * then let the AIL know that the buffer is being
-                * unlocked.
-                */
-               if (lip != NULL && lip->li_type == XFS_LI_BUF) {
-                       bip = bp->b_fspriv;
-                       xfs_trans_unlocked_item(bip->bli_item.li_ailp, lip);
-               }
                xfs_buf_relse(bp);
                return;
        }
@@ -550,21 +538,10 @@ xfs_trans_brelse(xfs_trans_t      *tp,
                ASSERT(!(bip->bli_item.li_flags & XFS_LI_IN_AIL));
                ASSERT(!(bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF));
                xfs_buf_item_relse(bp);
-               bip = NULL;
-       }
-       bp->b_transp = NULL;
-
-       /*
-        * If we've still got a buf log item on the buffer, then
-        * tell the AIL that the buffer is being unlocked.
-        */
-       if (bip != NULL) {
-               xfs_trans_unlocked_item(bip->bli_item.li_ailp,
-                                       (xfs_log_item_t*)bip);
        }
 
+       bp->b_transp = NULL;
        xfs_buf_relse(bp);
-       return;
 }
 
 /*
index c4ba366d24e65c8fde43fa7a6ca32b4735bce331..279099717ed2db8a11ab5f6a05ba76033254d921 100644 (file)
@@ -605,7 +605,7 @@ xfs_trans_dqresv(
        time_t          timer;
        xfs_qwarncnt_t  warns;
        xfs_qwarncnt_t  warnlimit;
-       xfs_qcnt_t      count;
+       xfs_qcnt_t      total_count;
        xfs_qcnt_t      *resbcountp;
        xfs_quotainfo_t *q = mp->m_quotainfo;
 
@@ -648,13 +648,12 @@ xfs_trans_dqresv(
                         * hardlimit or exceed the timelimit if we allocate
                         * nblks.
                         */
-                       if (hardlimit > 0ULL &&
-                           hardlimit < nblks + *resbcountp) {
+                       total_count = *resbcountp + nblks;
+                       if (hardlimit && total_count > hardlimit) {
                                xfs_quota_warn(mp, dqp, QUOTA_NL_BHARDWARN);
                                goto error_return;
                        }
-                       if (softlimit > 0ULL &&
-                           softlimit < nblks + *resbcountp) {
+                       if (softlimit && total_count > softlimit) {
                                if ((timer != 0 && get_seconds() > timer) ||
                                    (warns != 0 && warns >= warnlimit)) {
                                        xfs_quota_warn(mp, dqp,
@@ -666,7 +665,7 @@ xfs_trans_dqresv(
                        }
                }
                if (ninos > 0) {
-                       count = be64_to_cpu(dqp->q_core.d_icount);
+                       total_count = be64_to_cpu(dqp->q_core.d_icount) + ninos;
                        timer = be32_to_cpu(dqp->q_core.d_itimer);
                        warns = be16_to_cpu(dqp->q_core.d_iwarns);
                        warnlimit = dqp->q_mount->m_quotainfo->qi_iwarnlimit;
@@ -677,13 +676,11 @@ xfs_trans_dqresv(
                        if (!softlimit)
                                softlimit = q->qi_isoftlimit;
 
-                       if (hardlimit > 0ULL &&
-                           hardlimit < ninos + count) {
+                       if (hardlimit && total_count > hardlimit) {
                                xfs_quota_warn(mp, dqp, QUOTA_NL_IHARDWARN);
                                goto error_return;
                        }
-                       if (softlimit > 0ULL &&
-                           softlimit < ninos + count) {
+                       if (softlimit && total_count > softlimit) {
                                if  ((timer != 0 && get_seconds() > timer) ||
                                     (warns != 0 && warns >= warnlimit)) {
                                        xfs_quota_warn(mp, dqp,
@@ -878,7 +875,7 @@ STATIC void
 xfs_trans_alloc_dqinfo(
        xfs_trans_t     *tp)
 {
-       tp->t_dqinfo = kmem_zone_zalloc(xfs_Gqm->qm_dqtrxzone, KM_SLEEP);
+       tp->t_dqinfo = kmem_zone_zalloc(xfs_qm_dqtrxzone, KM_SLEEP);
 }
 
 void
@@ -887,6 +884,6 @@ xfs_trans_free_dqinfo(
 {
        if (!tp->t_dqinfo)
                return;
-       kmem_zone_free(xfs_Gqm->qm_dqtrxzone, tp->t_dqinfo);
+       kmem_zone_free(xfs_qm_dqtrxzone, tp->t_dqinfo);
        tp->t_dqinfo = NULL;
 }
index 32f0288ae10f30abe1f67d3ebcb766926fe8de7b..7a7442c03f2bfde90606a7e82e78fdd4d2363e6e 100644 (file)
@@ -95,10 +95,14 @@ xfs_trans_ichgtime(
        if ((flags & XFS_ICHGTIME_MOD) &&
            !timespec_equal(&inode->i_mtime, &tv)) {
                inode->i_mtime = tv;
+               ip->i_d.di_mtime.t_sec = tv.tv_sec;
+               ip->i_d.di_mtime.t_nsec = tv.tv_nsec;
        }
        if ((flags & XFS_ICHGTIME_CHG) &&
            !timespec_equal(&inode->i_ctime, &tv)) {
                inode->i_ctime = tv;
+               ip->i_d.di_ctime.t_sec = tv.tv_sec;
+               ip->i_d.di_ctime.t_nsec = tv.tv_nsec;
        }
 }
 
@@ -126,12 +130,12 @@ xfs_trans_log_inode(
        /*
         * Always OR in the bits from the ili_last_fields field.
         * This is to coordinate with the xfs_iflush() and xfs_iflush_done()
-        * routines in the eventual clearing of the ilf_fields bits.
+        * routines in the eventual clearing of the ili_fields bits.
         * See the big comment in xfs_iflush() for an explanation of
         * this coordination mechanism.
         */
        flags |= ip->i_itemp->ili_last_fields;
-       ip->i_itemp->ili_format.ilf_fields |= flags;
+       ip->i_itemp->ili_fields |= flags;
 }
 
 #ifdef XFS_TRANS_DEBUG
index 44820b9fcb4327f9d7d3b14b626e67f19bb18973..8ab2ced415f1d2680c274d6370047f749e74bce7 100644 (file)
@@ -104,9 +104,6 @@ void                        xfs_ail_push(struct xfs_ail *, xfs_lsn_t);
 void                   xfs_ail_push_all(struct xfs_ail *);
 xfs_lsn_t              xfs_ail_min_lsn(struct xfs_ail *ailp);
 
-void                   xfs_trans_unlocked_item(struct xfs_ail *,
-                                       xfs_log_item_t *);
-
 struct xfs_log_item *  xfs_trans_ail_cursor_first(struct xfs_ail *ailp,
                                        struct xfs_ail_cursor *cur,
                                        xfs_lsn_t lsn);
index 7c220b4227bc7f817cf507b84a1721a7d0091e2f..db14d0c08682b90949f031532069f715ca628804 100644 (file)
@@ -22,7 +22,6 @@
 
 struct file;
 struct xfs_inode;
-struct xfs_iomap;
 struct attrlist_cursor_kern;
 
 /*
index 0c877cbde142f8427690fd519264c68951c10b18..447e146b2ba6d8ae1a9213841f6daae5a209f487 100644 (file)
@@ -10,7 +10,6 @@ struct kiocb;
 struct pipe_inode_info;
 struct uio;
 struct xfs_inode;
-struct xfs_iomap;
 
 
 int xfs_setattr_nonsize(struct xfs_inode *ip, struct iattr *vap, int flags);
@@ -49,8 +48,6 @@ int xfs_attr_set(struct xfs_inode *dp, const unsigned char *name,
 int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags);
 int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize,
                int flags, struct attrlist_cursor_kern *cursor);
-int xfs_bmap(struct xfs_inode *ip, xfs_off_t offset, ssize_t count,
-               int flags, struct xfs_iomap *iomapp, int *niomaps);
 void xfs_tosspages(struct xfs_inode *inode, xfs_off_t first,
                xfs_off_t last, int fiopt);
 int xfs_flushinval_pages(struct xfs_inode *ip, xfs_off_t first,