git.openfabrics.org - ~shefty/rdma-dev.git/commitdiff
Merge branch 'restriper' of git://github.com/idryomov/btrfs-unstable into integration
author Chris Mason <chris.mason@oracle.com>
Mon, 16 Jan 2012 20:26:02 +0000 (15:26 -0500)
committer Chris Mason <chris.mason@oracle.com>
Mon, 16 Jan 2012 20:26:02 +0000 (15:26 -0500)
1  2 
fs/btrfs/disk-io.c
fs/btrfs/extent-tree.c
fs/btrfs/super.c
fs/btrfs/volumes.c

diff --combined fs/btrfs/disk-io.c
index f44b3928dc2dc94cb62cefe72f1063f282dc09c2,c23b82d8ec08aa5e90553c481451dfb9d1bd555c..9c1a744e595b076868e966867a6bd55e5f6376cd
@@@ -2002,12 -2002,20 +2002,20 @@@ struct btrfs_root *open_ctree(struct su
        init_rwsem(&fs_info->scrub_super_lock);
        fs_info->scrub_workers_refcnt = 0;
  
+       spin_lock_init(&fs_info->balance_lock);
+       mutex_init(&fs_info->balance_mutex);
+       atomic_set(&fs_info->balance_running, 0);
+       atomic_set(&fs_info->balance_pause_req, 0);
+       atomic_set(&fs_info->balance_cancel_req, 0);
+       fs_info->balance_ctl = NULL;
+       init_waitqueue_head(&fs_info->balance_wait_q);
        sb->s_blocksize = 4096;
        sb->s_blocksize_bits = blksize_bits(4096);
        sb->s_bdi = &fs_info->bdi;
  
        fs_info->btree_inode->i_ino = BTRFS_BTREE_INODE_OBJECTID;
 -      fs_info->btree_inode->i_nlink = 1;
 +      set_nlink(fs_info->btree_inode, 1);
        /*
         * we set the i_size on the btree inode to the max possible int.
         * the real end of the address space is determined by all of
@@@ -2321,9 -2329,6 +2329,6 @@@ retry_root_backup
  
        fs_info->generation = generation;
        fs_info->last_trans_committed = generation;
-       fs_info->data_alloc_profile = (u64)-1;
-       fs_info->metadata_alloc_profile = (u64)-1;
-       fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;
  
        ret = btrfs_init_space_info(fs_info);
        if (ret) {
                if (!err)
                        err = btrfs_orphan_cleanup(fs_info->tree_root);
                up_read(&fs_info->cleanup_work_sem);
+               if (!err)
+                       err = btrfs_recover_balance(fs_info->tree_root);
                if (err) {
                        close_ctree(tree_root);
                        return ERR_PTR(err);
@@@ -2975,6 -2984,9 +2984,9 @@@ int close_ctree(struct btrfs_root *root
        fs_info->closing = 1;
        smp_mb();
  
+       /* pause restriper - we want to resume on mount */
+       btrfs_pause_balance(root->fs_info);
        btrfs_scrub_cancel(root);
  
        /* wait for any defraggers to finish */
diff --combined fs/btrfs/extent-tree.c
index 37594e4bf660e166b5c3d2d37ff78a459076120d,e6a832e3e647d3ab11a75cc2a4361c69284f60ed..352083ad233ca2a18b8c3817d3f3a5644000bf76
@@@ -618,8 -618,7 +618,7 @@@ static struct btrfs_space_info *__find_
        struct list_head *head = &info->space_info;
        struct btrfs_space_info *found;
  
-       flags &= BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_SYSTEM |
-                BTRFS_BLOCK_GROUP_METADATA;
+       flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;
  
        rcu_read_lock();
        list_for_each_entry_rcu(found, head, list) {
@@@ -2267,7 -2266,9 +2266,7 @@@ static noinline int run_clustered_refs(
                                BUG_ON(ret);
                                kfree(extent_op);
  
 -                              cond_resched();
 -                              spin_lock(&delayed_refs->lock);
 -                              continue;
 +                              goto next;
                        }
  
                        list_del_init(&locked_ref->cluster);
                btrfs_put_delayed_ref(ref);
                kfree(extent_op);
                count++;
 -
 +next:
 +              do_chunk_alloc(trans, root->fs_info->extent_root,
 +                             2 * 1024 * 1024,
 +                             btrfs_get_alloc_profile(root, 0),
 +                             CHUNK_ALLOC_NO_FORCE);
                cond_resched();
                spin_lock(&delayed_refs->lock);
        }
@@@ -2319,10 -2316,6 +2318,10 @@@ int btrfs_run_delayed_refs(struct btrfs
        if (root == root->fs_info->extent_root)
                root = root->fs_info->tree_root;
  
 +      do_chunk_alloc(trans, root->fs_info->extent_root,
 +                     2 * 1024 * 1024, btrfs_get_alloc_profile(root, 0),
 +                     CHUNK_ALLOC_NO_FORCE);
 +
        delayed_refs = &trans->transaction->delayed_refs;
        INIT_LIST_HEAD(&cluster);
  again:
@@@ -2999,9 -2992,7 +2998,7 @@@ static int update_space_info(struct btr
                INIT_LIST_HEAD(&found->block_groups[i]);
        init_rwsem(&found->groups_sem);
        spin_lock_init(&found->lock);
-       found->flags = flags & (BTRFS_BLOCK_GROUP_DATA |
-                               BTRFS_BLOCK_GROUP_SYSTEM |
-                               BTRFS_BLOCK_GROUP_METADATA);
+       found->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
        found->total_bytes = total_bytes;
        found->disk_total = total_bytes * factor;
        found->bytes_used = bytes_used;
  
  static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
  {
-       u64 extra_flags = flags & (BTRFS_BLOCK_GROUP_RAID0 |
-                                  BTRFS_BLOCK_GROUP_RAID1 |
-                                  BTRFS_BLOCK_GROUP_RAID10 |
-                                  BTRFS_BLOCK_GROUP_DUP);
-       if (extra_flags) {
-               if (flags & BTRFS_BLOCK_GROUP_DATA)
-                       fs_info->avail_data_alloc_bits |= extra_flags;
-               if (flags & BTRFS_BLOCK_GROUP_METADATA)
-                       fs_info->avail_metadata_alloc_bits |= extra_flags;
-               if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
-                       fs_info->avail_system_alloc_bits |= extra_flags;
-       }
+       u64 extra_flags = flags & BTRFS_BLOCK_GROUP_PROFILE_MASK;
+       /*
+        * chunk -> extended profile: when @flags carries no profile bits,
+        * BTRFS_AVAIL_ALLOC_BIT_SINGLE is recorded instead of zero, so the
+        * avail_*_alloc_bits mask of every block group type named in
+        * @flags always gains at least one bit.
+        */
+       if (extra_flags == 0)
+               extra_flags = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
+       if (flags & BTRFS_BLOCK_GROUP_DATA)
+               fs_info->avail_data_alloc_bits |= extra_flags;
+       if (flags & BTRFS_BLOCK_GROUP_METADATA)
+               fs_info->avail_metadata_alloc_bits |= extra_flags;
+       if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
+               fs_info->avail_system_alloc_bits |= extra_flags;
  }
  
+ /*
+  * @flags: available profiles in extended format (see ctree.h)
+  *
+  * Returns reduced profile in chunk format.  If profile changing is in
+  * progress (either running or paused) picks the target profile (if it's
+  * already available), otherwise falls back to plain reducing.
+  */
  u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
  {
        /*
        u64 num_devices = root->fs_info->fs_devices->rw_devices +
                root->fs_info->fs_devices->missing_devices;
  
+       /* pick restriper's target profile if it's available */
+       spin_lock(&root->fs_info->balance_lock);
+       if (root->fs_info->balance_ctl) {
+               struct btrfs_balance_control *bctl = root->fs_info->balance_ctl;
+               u64 tgt = 0;
+               if ((flags & BTRFS_BLOCK_GROUP_DATA) &&
+                   (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+                   (flags & bctl->data.target)) {
+                       tgt = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
+               } else if ((flags & BTRFS_BLOCK_GROUP_SYSTEM) &&
+                          (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+                          (flags & bctl->sys.target)) {
+                       tgt = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
+               } else if ((flags & BTRFS_BLOCK_GROUP_METADATA) &&
+                          (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+                          (flags & bctl->meta.target)) {
+                       tgt = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
+               }
+               if (tgt) {
+                       spin_unlock(&root->fs_info->balance_lock);
+                       flags = tgt;
+                       goto out;
+               }
+       }
+       spin_unlock(&root->fs_info->balance_lock);
        if (num_devices == 1)
                flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
        if (num_devices < 4)
        if ((flags & BTRFS_BLOCK_GROUP_RAID0) &&
            ((flags & BTRFS_BLOCK_GROUP_RAID1) |
             (flags & BTRFS_BLOCK_GROUP_RAID10) |
-            (flags & BTRFS_BLOCK_GROUP_DUP)))
+            (flags & BTRFS_BLOCK_GROUP_DUP))) {
                flags &= ~BTRFS_BLOCK_GROUP_RAID0;
+       }
+ out:
+       /* extended -> chunk profile */
+       flags &= ~BTRFS_AVAIL_ALLOC_BIT_SINGLE;
        return flags;
  }
  
  static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
  {
+       /*
+        * OR the profiles recorded as available for the block group type
+        * named in @flags into @flags (types are tested in the order data,
+        * system, metadata and only the first match is used), then hand
+        * the result to btrfs_reduce_alloc_profile().
+        */
        if (flags & BTRFS_BLOCK_GROUP_DATA)
-               flags |= root->fs_info->avail_data_alloc_bits &
-                        root->fs_info->data_alloc_profile;
+               flags |= root->fs_info->avail_data_alloc_bits;
        else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
-               flags |= root->fs_info->avail_system_alloc_bits &
-                        root->fs_info->system_alloc_profile;
+               flags |= root->fs_info->avail_system_alloc_bits;
        else if (flags & BTRFS_BLOCK_GROUP_METADATA)
-               flags |= root->fs_info->avail_metadata_alloc_bits &
-                        root->fs_info->metadata_alloc_profile;
+               flags |= root->fs_info->avail_metadata_alloc_bits;
        return btrfs_reduce_alloc_profile(root, flags);
  }
  
@@@ -3263,12 -3292,27 +3298,12 @@@ static int should_alloc_chunk(struct bt
                if (num_bytes - num_allocated < thresh)
                        return 1;
        }
 -
 -      /*
 -       * we have two similar checks here, one based on percentage
 -       * and once based on a hard number of 256MB.  The idea
 -       * is that if we have a good amount of free
 -       * room, don't allocate a chunk.  A good mount is
 -       * less than 80% utilized of the chunks we have allocated,
 -       * or more than 256MB free
 -       */
 -      if (num_allocated + alloc_bytes + 256 * 1024 * 1024 < num_bytes)
 -              return 0;
 -
 -      if (num_allocated + alloc_bytes < div_factor(num_bytes, 8))
 -              return 0;
 -
        thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
  
 -      /* 256MB or 5% of the FS */
 -      thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5));
 +      /* 256MB or 2% of the FS */
 +      thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 2));
  
 -      if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 3))
 +      if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 8))
                return 0;
        return 1;
  }
@@@ -3282,7 -3326,7 +3317,7 @@@ static int do_chunk_alloc(struct btrfs_
        int wait_for_alloc = 0;
        int ret = 0;
  
-       flags = btrfs_reduce_alloc_profile(extent_root, flags);
+       BUG_ON(!profile_is_valid(flags, 0));
  
        space_info = __find_space_info(extent_root->fs_info, flags);
        if (!space_info) {
@@@ -3407,8 -3451,7 +3442,8 @@@ static int shrink_delalloc(struct btrfs
                smp_mb();
                nr_pages = min_t(unsigned long, nr_pages,
                       root->fs_info->delalloc_bytes >> PAGE_CACHE_SHIFT);
 -              writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages);
 +              writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages,
 +                                              WB_REASON_FS_FREE_SPACE);
  
                spin_lock(&space_info->lock);
                if (reserved > space_info->bytes_may_use)
@@@ -5286,6 -5329,15 +5321,6 @@@ alloc
                if (unlikely(block_group->ro))
                        goto loop;
  
 -              spin_lock(&block_group->free_space_ctl->tree_lock);
 -              if (cached &&
 -                  block_group->free_space_ctl->free_space <
 -                  num_bytes + empty_cluster + empty_size) {
 -                      spin_unlock(&block_group->free_space_ctl->tree_lock);
 -                      goto loop;
 -              }
 -              spin_unlock(&block_group->free_space_ctl->tree_lock);
 -
                /*
                 * Ok we want to try and use the cluster allocator, so
                 * lets look there
@@@ -5331,15 -5383,8 +5366,15 @@@ refill_cluster
                         * plenty of times and not have found
                         * anything, so we are likely way too
                         * fragmented for the clustering stuff to find
 -                       * anything.  */
 -                      if (loop >= LOOP_NO_EMPTY_SIZE) {
 +                       * anything.
 +                       *
 +                       * However, if the cluster is taken from the
 +                       * current block group, release the cluster
 +                       * first, so that we stand a better chance of
 +                       * succeeding in the unclustered
 +                       * allocation.  */
 +                      if (loop >= LOOP_NO_EMPTY_SIZE &&
 +                          last_ptr->block_group != block_group) {
                                spin_unlock(&last_ptr->refill_lock);
                                goto unclustered_alloc;
                        }
                         */
                        btrfs_return_cluster_to_free_space(NULL, last_ptr);
  
 +                      if (loop >= LOOP_NO_EMPTY_SIZE) {
 +                              spin_unlock(&last_ptr->refill_lock);
 +                              goto unclustered_alloc;
 +                      }
 +
                        /* allocate a cluster in this block group */
                        ret = btrfs_find_space_cluster(trans, root,
                                               block_group, last_ptr,
                }
  
  unclustered_alloc:
 +              spin_lock(&block_group->free_space_ctl->tree_lock);
 +              if (cached &&
 +                  block_group->free_space_ctl->free_space <
 +                  num_bytes + empty_cluster + empty_size) {
 +                      spin_unlock(&block_group->free_space_ctl->tree_lock);
 +                      goto loop;
 +              }
 +              spin_unlock(&block_group->free_space_ctl->tree_lock);
 +
                offset = btrfs_find_space_for_alloc(block_group, search_start,
                                                    num_bytes, empty_size);
                /*
@@@ -5441,6 -5472,9 +5476,6 @@@ checks
                        goto loop;
                }
  
 -              ins->objectid = search_start;
 -              ins->offset = num_bytes;
 -
                if (offset < search_start)
                        btrfs_add_free_space(used_block_group, offset,
                                             search_start - offset);
@@@ -6792,6 -6826,29 +6827,29 @@@ static u64 update_block_group_flags(str
        u64 stripped = BTRFS_BLOCK_GROUP_RAID0 |
                BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
  
+       if (root->fs_info->balance_ctl) {
+               struct btrfs_balance_control *bctl = root->fs_info->balance_ctl;
+               u64 tgt = 0;
+               /* pick restriper's target profile and return */
+               if (flags & BTRFS_BLOCK_GROUP_DATA &&
+                   bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
+                       tgt = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
+               } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
+                          bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
+                       tgt = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
+               } else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
+                          bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
+                       tgt = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
+               }
+               if (tgt) {
+                       /* extended -> chunk profile */
+                       tgt &= ~BTRFS_AVAIL_ALLOC_BIT_SINGLE;
+                       return tgt;
+               }
+       }
        /*
         * we add in the count of missing devices because we want
         * to make sure that any RAID levels on a degraded FS
@@@ -7466,6 -7523,22 +7524,22 @@@ int btrfs_make_block_group(struct btrfs
        return 0;
  }
  
+ /*
+  * Remove the profile carried by @flags from the available-profile mask
+  * of every block group type (data, metadata, system) named in @flags.
+  */
+ static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
+ {
+       u64 profile = flags & BTRFS_BLOCK_GROUP_PROFILE_MASK;
+       u64 keep;
+       /* chunk -> extended profile: no profile bits means "single" */
+       if (!profile)
+               profile = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
+       keep = ~profile;
+       if (flags & BTRFS_BLOCK_GROUP_DATA)
+               fs_info->avail_data_alloc_bits &= keep;
+       if (flags & BTRFS_BLOCK_GROUP_METADATA)
+               fs_info->avail_metadata_alloc_bits &= keep;
+       if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
+               fs_info->avail_system_alloc_bits &= keep;
+ }
  int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
                             struct btrfs_root *root, u64 group_start)
  {
        struct btrfs_key key;
        struct inode *inode;
        int ret;
+       int index;
        int factor;
  
        root = root->fs_info->extent_root;
        free_excluded_extents(root, block_group);
  
        memcpy(&key, &block_group->key, sizeof(key));
+       index = get_block_group_index(block_group);
        if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP |
                                  BTRFS_BLOCK_GROUP_RAID1 |
                                  BTRFS_BLOCK_GROUP_RAID10))
         * are still on the list after taking the semaphore
         */
        list_del_init(&block_group->list);
+       if (list_empty(&block_group->space_info->block_groups[index]))
+               clear_avail_alloc_bits(root->fs_info, block_group->flags);
        up_write(&block_group->space_info->groups_sem);
  
        if (block_group->cached == BTRFS_CACHE_STARTED)
diff --combined fs/btrfs/super.c
index 200f63bc6675eca20cf1b55c9ced534efccaf63b,063b521e3ded270510ea4b95f97e5d1821efa2ce..5a7227fa93804c7b78bb205d6af94b52c55e1b32
@@@ -164,8 -164,9 +164,9 @@@ enum 
        Opt_compress_type, Opt_compress_force, Opt_compress_force_type,
        Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard,
        Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed,
-       Opt_enospc_debug, Opt_subvolrootid, Opt_defrag,
-       Opt_inode_cache, Opt_no_space_cache, Opt_recovery, Opt_err,
+       Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, Opt_inode_cache,
+       Opt_no_space_cache, Opt_recovery, Opt_skip_balance,
+       Opt_err,
  };
  
  static match_table_t tokens = {
        {Opt_inode_cache, "inode_cache"},
        {Opt_no_space_cache, "nospace_cache"},
        {Opt_recovery, "recovery"},
+       {Opt_skip_balance, "skip_balance"},
        {Opt_err, NULL},
  };
  
@@@ -398,6 -400,9 +400,9 @@@ int btrfs_parse_options(struct btrfs_ro
                        printk(KERN_INFO "btrfs: enabling auto recovery");
                        btrfs_set_opt(info->mount_opt, RECOVERY);
                        break;
+               case Opt_skip_balance:
+                       btrfs_set_opt(info->mount_opt, SKIP_BALANCE);
+                       break;
                case Opt_err:
                        printk(KERN_INFO "btrfs: unrecognized mount option "
                               "'%s'\n", p);
@@@ -723,6 -728,8 +728,8 @@@ static int btrfs_show_options(struct se
                seq_puts(seq, ",autodefrag");
        if (btrfs_test_opt(root, INODE_MAP_CACHE))
                seq_puts(seq, ",inode_cache");
+       if (btrfs_test_opt(root, SKIP_BALANCE))
+               seq_puts(seq, ",skip_balance");
        return 0;
  }
  
@@@ -826,9 -833,13 +833,9 @@@ static char *setup_root_args(char *args
  static struct dentry *mount_subvol(const char *subvol_name, int flags,
                                   const char *device_name, char *data)
  {
 -      struct super_block *s;
        struct dentry *root;
        struct vfsmount *mnt;
 -      struct mnt_namespace *ns_private;
        char *newargs;
 -      struct path path;
 -      int error;
  
        newargs = setup_root_args(data);
        if (!newargs)
        if (IS_ERR(mnt))
                return ERR_CAST(mnt);
  
 -      ns_private = create_mnt_ns(mnt);
 -      if (IS_ERR(ns_private)) {
 -              mntput(mnt);
 -              return ERR_CAST(ns_private);
 -      }
 +      root = mount_subtree(mnt, subvol_name);
  
 -      /*
 -       * This will trigger the automount of the subvol so we can just
 -       * drop the mnt we have here and return the dentry that we
 -       * found.
 -       */
 -      error = vfs_path_lookup(mnt->mnt_root, mnt, subvol_name,
 -                              LOOKUP_FOLLOW, &path);
 -      put_mnt_ns(ns_private);
 -      if (error)
 -              return ERR_PTR(error);
 -
 -      if (!is_subvolume_inode(path.dentry->d_inode)) {
 -              path_put(&path);
 -              mntput(mnt);
 -              error = -EINVAL;
 +      if (!IS_ERR(root) && !is_subvolume_inode(root->d_inode)) {
 +              struct super_block *s = root->d_sb;
 +              dput(root);
 +              root = ERR_PTR(-EINVAL);
 +              deactivate_locked_super(s);
                printk(KERN_ERR "btrfs: '%s' is not a valid subvolume\n",
                                subvol_name);
 -              return ERR_PTR(-EINVAL);
        }
  
 -      /* Get a ref to the sb and the dentry we found and return it */
 -      s = path.mnt->mnt_sb;
 -      atomic_inc(&s->s_active);
 -      root = dget(path.dentry);
 -      path_put(&path);
 -      down_write(&s->s_umount);
 -
        return root;
  }
  
diff --combined fs/btrfs/volumes.c
index ac00e3aa80a11c4752e15576c52510befb2f2dcc,d73439b4d7da18f6a644a5f7c97d2a1a0efa759a..9489a2aca47b04bef55c159dcd5e56615ba3ee86
@@@ -23,6 -23,7 +23,7 @@@
  #include <linux/random.h>
  #include <linux/iocontext.h>
  #include <linux/capability.h>
+ #include <linux/kthread.h>
  #include <asm/div64.h>
  #include "compat.h"
  #include "ctree.h"
@@@ -1282,7 -1283,6 +1283,6 @@@ int btrfs_rm_device(struct btrfs_root *
        bool clear_super = false;
  
        mutex_lock(&uuid_mutex);
-       mutex_lock(&root->fs_info->volume_mutex);
  
        all_avail = root->fs_info->avail_data_alloc_bits |
                root->fs_info->avail_system_alloc_bits |
@@@ -1452,7 -1452,6 +1452,6 @@@ error_close
        if (bdev)
                blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
  out:
-       mutex_unlock(&root->fs_info->volume_mutex);
        mutex_unlock(&uuid_mutex);
        return ret;
  error_undo:
@@@ -1629,7 -1628,6 +1628,6 @@@ int btrfs_init_new_device(struct btrfs_
        }
  
        filemap_write_and_wait(bdev->bd_inode->i_mapping);
-       mutex_lock(&root->fs_info->volume_mutex);
  
        devices = &root->fs_info->fs_devices->devices;
        /*
                ret = btrfs_relocate_sys_chunks(root);
                BUG_ON(ret);
        }
- out:
-       mutex_unlock(&root->fs_info->volume_mutex);
        return ret;
  error:
        blkdev_put(bdev, FMODE_EXCL);
                mutex_unlock(&uuid_mutex);
                up_write(&sb->s_umount);
        }
-       goto out;
+       return ret;
  }
  
  static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
@@@ -2077,6 -2074,362 +2074,362 @@@ error
        return ret;
  }
  
+ /*
+  * Store the balance state held in @bctl as an item in @root, under the
+  * key (BTRFS_BALANCE_OBJECTID, BTRFS_BALANCE_ITEM_KEY, 0).
+  *
+  * The data, meta and sys argument sets are each converted with
+  * btrfs_cpu_balance_args_to_disk() before being written, followed by
+  * bctl->flags.  The work is done in its own transaction, which is
+  * committed before returning.
+  *
+  * Returns 0 on success, -ENOMEM if no path could be allocated, otherwise
+  * the first error reported by starting the transaction, inserting the
+  * item or committing.
+  */
+ static int insert_balance_item(struct btrfs_root *root,
+                              struct btrfs_balance_control *bctl)
+ {
+       struct btrfs_trans_handle *trans;
+       struct btrfs_balance_item *item;
+       struct btrfs_disk_balance_args disk_bargs;
+       struct btrfs_path *path;
+       struct extent_buffer *leaf;
+       struct btrfs_key key;
+       int ret, err;
+       path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
+       trans = btrfs_start_transaction(root, 0);
+       if (IS_ERR(trans)) {
+               btrfs_free_path(path);
+               return PTR_ERR(trans);
+       }
+       key.objectid = BTRFS_BALANCE_OBJECTID;
+       key.type = BTRFS_BALANCE_ITEM_KEY;
+       key.offset = 0;
+       ret = btrfs_insert_empty_item(trans, root, path, &key,
+                                     sizeof(*item));
+       if (ret)
+               goto out;
+       leaf = path->nodes[0];
+       item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
+       /* zero the whole item first, then fill in each field */
+       memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
+       /* disk_bargs is a scratch buffer reused for all three arg sets */
+       btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data);
+       btrfs_set_balance_data(leaf, item, &disk_bargs);
+       btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta);
+       btrfs_set_balance_meta(leaf, item, &disk_bargs);
+       btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);
+       btrfs_set_balance_sys(leaf, item, &disk_bargs);
+       btrfs_set_balance_flags(leaf, item, bctl->flags);
+       btrfs_mark_buffer_dirty(leaf);
+ out:
+       /*
+        * The path is released before the commit.  The transaction is
+        * committed on the failure path too; an insertion error takes
+        * precedence over a commit error in the return value.
+        */
+       btrfs_free_path(path);
+       err = btrfs_commit_transaction(trans, root);
+       if (err && !ret)
+               ret = err;
+       return ret;
+ }
+ /*
+  * Delete the balance item, i.e. the item keyed
+  * (BTRFS_BALANCE_OBJECTID, BTRFS_BALANCE_ITEM_KEY, 0), from @root.
+  *
+  * The work is done in its own transaction, which is committed before
+  * returning.
+  *
+  * Returns 0 on success, -ENOMEM if no path could be allocated, -ENOENT
+  * if the item does not exist, otherwise the first error reported by
+  * starting the transaction, the search, the deletion or the commit.
+  */
+ static int del_balance_item(struct btrfs_root *root)
+ {
+       struct btrfs_trans_handle *trans;
+       struct btrfs_path *path;
+       struct btrfs_key key;
+       int ret, err;
+       path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
+       trans = btrfs_start_transaction(root, 0);
+       if (IS_ERR(trans)) {
+               btrfs_free_path(path);
+               return PTR_ERR(trans);
+       }
+       key.objectid = BTRFS_BALANCE_OBJECTID;
+       key.type = BTRFS_BALANCE_ITEM_KEY;
+       key.offset = 0;
+       ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+       if (ret < 0)
+               goto out;
+       /* a positive return means the exact key was not found */
+       if (ret > 0) {
+               ret = -ENOENT;
+               goto out;
+       }
+       ret = btrfs_del_item(trans, root, path);
+ out:
+       /*
+        * The transaction is committed on the failure paths too; an
+        * earlier error takes precedence over a commit error.
+        */
+       btrfs_free_path(path);
+       err = btrfs_commit_transaction(trans, root);
+       if (err && !ret)
+               ret = err;
+       return ret;
+ }
+ /*
+  * Heuristic applied when an interrupted balance is resumed, meant to cut
+  * down the number of chunks that get balanced again.  The data, sys and
+  * meta argument sets are adjusted independently of one another.
+  */
+ static void update_balance_args(struct btrfs_balance_control *bctl)
+ {
+       struct btrfs_balance_args *all_args[] = {
+               &bctl->data, &bctl->sys, &bctl->meta,
+       };
+       size_t i;
+       for (i = 0; i < sizeof(all_args) / sizeof(all_args[0]); i++) {
+               struct btrfs_balance_args *bargs = all_args[i];
+               /*
+                * A chunk type that was being converted gets soft mode and
+                * no usage filter - a usage filter would keep us from
+                * relocating unconverted (albeit full) chunks.
+                */
+               if (bargs->flags & BTRFS_BALANCE_ARGS_CONVERT) {
+                       bargs->flags |= BTRFS_BALANCE_ARGS_SOFT;
+                       continue;
+               }
+               /*
+                * Otherwise turn on the usage filter unless one is already
+                * in use.  The idea is that chunks we have already balanced
+                * should be reasonably full.
+                */
+               if (!(bargs->flags & BTRFS_BALANCE_ARGS_USAGE)) {
+                       bargs->flags |= BTRFS_BALANCE_ARGS_USAGE;
+                       bargs->usage = 90;
+               }
+       }
+ }
+ /*
+  * Install @bctl as the balance control of bctl->fs_info.  Triggers
+  * BUG_ON() if a balance control is already installed.
+  *
+  * Should be called with both balance and volume mutexes held to
+  * serialize other volume operations (add_dev/rm_dev/resize) with
+  * restriper.  Same goes for unset_balance_control.  The pointer itself
+  * is updated under fs_info->balance_lock.
+  */
+ static void set_balance_control(struct btrfs_balance_control *bctl)
+ {
+       struct btrfs_fs_info *fs_info = bctl->fs_info;
+       BUG_ON(fs_info->balance_ctl);
+       spin_lock(&fs_info->balance_lock);
+       fs_info->balance_ctl = bctl;
+       spin_unlock(&fs_info->balance_lock);
+ }
+ /*
+  * Detach and free the balance control installed in @fs_info.  Triggers
+  * BUG_ON() if none is installed.  The pointer is cleared under
+  * fs_info->balance_lock and the structure is kfree()d only after the
+  * lock has been dropped.
+  */
+ static void unset_balance_control(struct btrfs_fs_info *fs_info)
+ {
+       struct btrfs_balance_control *bctl = fs_info->balance_ctl;
+       BUG_ON(!fs_info->balance_ctl);
+       spin_lock(&fs_info->balance_lock);
+       fs_info->balance_ctl = NULL;
+       spin_unlock(&fs_info->balance_lock);
+       kfree(bctl);
+ }
+ /*
+  * Balance filters.  Return 1 if chunk should be filtered out
+  * (should not be balanced).
+  *
+  * Profiles filter: the chunk passes (0) only when its profile, in
+  * extended format, is one of the bits set in bargs->profiles.
+  */
+ static int chunk_profiles_filter(u64 chunk_profile,
+                                struct btrfs_balance_args *bargs)
+ {
+       u64 profile = chunk_profile & BTRFS_BLOCK_GROUP_PROFILE_MASK;
+       /* chunk -> extended profile */
+       if (!profile)
+               profile = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
+       return (bargs->profiles & profile) ? 0 : 1;
+ }
+ /*
+  * Returns @factor percent of @num.  A @factor of 0 or less yields 0 and
+  * a @factor of 100 or more yields @num unchanged.
+  */
+ static u64 div_factor_fine(u64 num, int factor)
+ {
+       u64 scaled;
+       if (factor >= 100)
+               return num;
+       if (factor <= 0)
+               return 0;
+       scaled = num * factor;
+       do_div(scaled, 100);
+       return scaled;
+ }
+ /*
+  * Usage filter.  Returns 0 (balance the chunk) when the bytes used in
+  * the block group found at @chunk_offset are below bargs->usage percent
+  * of cache->key.offset, and 1 (filter it out) otherwise.
+  *
+  * Also returns 1 when no block group is found at @chunk_offset, since
+  * there is nothing to measure.
+  */
+ static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
+                             struct btrfs_balance_args *bargs)
+ {
+       struct btrfs_block_group_cache *cache;
+       u64 chunk_used, user_thresh;
+       int ret = 1;
+       cache = btrfs_lookup_block_group(fs_info, chunk_offset);
+       /* the lookup may come back empty; do not dereference NULL */
+       if (!cache)
+               return 1;
+       chunk_used = btrfs_block_group_used(&cache->item);
+       user_thresh = div_factor_fine(cache->key.offset, bargs->usage);
+       if (chunk_used < user_thresh)
+               ret = 0;
+       /* drop the reference taken by the lookup */
+       btrfs_put_block_group(cache);
+       return ret;
+ }
+ /*
+  * Devid filter.  Returns 0 as soon as one stripe of @chunk is found on
+  * device bargs->devid, 1 if none of its stripes is.
+  */
+ static int chunk_devid_filter(struct extent_buffer *leaf,
+                             struct btrfs_chunk *chunk,
+                             struct btrfs_balance_args *bargs)
+ {
+       int nr = btrfs_chunk_num_stripes(leaf, chunk);
+       int idx = 0;
+       while (idx < nr) {
+               struct btrfs_stripe *cur = btrfs_stripe_nr(chunk, idx);
+               if (btrfs_stripe_devid(leaf, cur) == bargs->devid)
+                       return 0;
+               idx++;
+       }
+       return 1;
+ }
+ /*
+  * Device range filter over [pstart, pend).  Returns 0 when the devid
+  * filter is not in use, or when a stripe located on bargs->devid
+  * overlaps the range; returns 1 otherwise.
+  */
+ static int chunk_drange_filter(struct extent_buffer *leaf,
+                              struct btrfs_chunk *chunk,
+                              u64 chunk_offset,
+                              struct btrfs_balance_args *bargs)
+ {
+       int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
+       u64 halved = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
+                    BTRFS_BLOCK_GROUP_RAID10;
+       int divisor;
+       int i;
+       if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
+               return 0;
+       /*
+        * For DUP/RAID1/RAID10 the chunk length is divided over half the
+        * stripes, for everything else over all of them.
+        */
+       if (btrfs_chunk_type(leaf, chunk) & halved)
+               divisor = num_stripes / 2;
+       else
+               divisor = num_stripes;
+       for (i = 0; i < num_stripes; i++) {
+               struct btrfs_stripe *stripe = btrfs_stripe_nr(chunk, i);
+               u64 start;
+               u64 len;
+               if (btrfs_stripe_devid(leaf, stripe) != bargs->devid)
+                       continue;
+               start = btrfs_stripe_offset(leaf, stripe);
+               len = btrfs_chunk_length(leaf, chunk);
+               do_div(len, divisor);
+               if (start < bargs->pend && start + len > bargs->pstart)
+                       return 0;
+       }
+       return 1;
+ }
+ /*
+  * Virtual range filter over [vstart, vend).  Returns 0 when at least
+  * part of the chunk lies inside the range, 1 otherwise.
+  */
+ static int chunk_vrange_filter(struct extent_buffer *leaf,
+                              struct btrfs_chunk *chunk,
+                              u64 chunk_offset,
+                              struct btrfs_balance_args *bargs)
+ {
+       /* chunk starts at or past the end of the range */
+       if (chunk_offset >= bargs->vend)
+               return 1;
+       /* chunk ends at or before the start of the range */
+       if (chunk_offset + btrfs_chunk_length(leaf, chunk) <= bargs->vstart)
+               return 1;
+       return 0;
+ }
+ /*
+  * Soft convert filter: detect chunks whose profile already matches the
+  * conversion target.
+  *
+  * @chunk_profile is masked with BTRFS_BLOCK_GROUP_PROFILE_MASK; a chunk with
+  * no profile bits set is treated as BTRFS_AVAIL_ALLOC_BIT_SINGLE.
+  *
+  * Returns 1 when @bargs requests a conversion and the chunk's profile is
+  * part of @bargs->target (the caller then skips the chunk), 0 otherwise.
+  */
+ static int chunk_soft_convert_filter(u64 chunk_profile,
+                                    struct btrfs_balance_args *bargs)
+ {
+       u64 profile = chunk_profile & BTRFS_BLOCK_GROUP_PROFILE_MASK;
+       if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
+               return 0;
+       if (!profile)
+               profile = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
+       return (bargs->target & profile) ? 1 : 0;
+ }
+ /*
+  * Decide whether @chunk, located at logical @chunk_offset, is to be
+  * relocated by the balance described in root->fs_info->balance_ctl.
+  *
+  * The chunk type must first be selected by the balance flags.  The
+  * per-type arguments (data, system or metadata, checked in that order) are
+  * then picked and every filter enabled in their flags is consulted; a
+  * filter returning nonzero rejects the chunk.
+  *
+  * Returns 1 if the chunk should be balanced, 0 if it should be skipped.
+  */
+ static int should_balance_chunk(struct btrfs_root *root,
+                               struct extent_buffer *leaf,
+                               struct btrfs_chunk *chunk, u64 chunk_offset)
+ {
+       struct btrfs_balance_control *bctl = root->fs_info->balance_ctl;
+       u64 chunk_type = btrfs_chunk_type(leaf, chunk);
+       struct btrfs_balance_args *bargs = NULL;
+       /* type filter */
+       if (((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) &
+            (bctl->flags & BTRFS_BALANCE_TYPE_MASK)) == 0)
+               return 0;
+       if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
+               bargs = &bctl->data;
+       else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
+               bargs = &bctl->sys;
+       else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
+               bargs = &bctl->meta;
+       /*
+        * Filters run in a fixed order and evaluation stops at the first
+        * one that rejects the chunk.  A filter is only called when its
+        * flag is set in bargs->flags.
+        */
+       if (/* profiles filter */
+           ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) &&
+            chunk_profiles_filter(chunk_type, bargs)) ||
+           /* usage filter */
+           ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
+            chunk_usage_filter(bctl->fs_info, chunk_offset, bargs)) ||
+           /* devid filter */
+           ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) &&
+            chunk_devid_filter(leaf, chunk, bargs)) ||
+           /* drange filter, makes sense only with devid filter */
+           ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) &&
+            chunk_drange_filter(leaf, chunk, chunk_offset, bargs)) ||
+           /* vrange filter */
+           ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) &&
+            chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) ||
+           /* soft profile changing mode */
+           ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) &&
+            chunk_soft_convert_filter(chunk_type, bargs)))
+               return 0;
+       return 1;
+ }
  static u64 div_factor(u64 num, int factor)
  {
        if (factor == 10)
        return num;
  }
  
int btrfs_balance(struct btrfs_root *dev_root)
static int __btrfs_balance(struct btrfs_fs_info *fs_info)
  {
-       int ret;
-       struct list_head *devices = &dev_root->fs_info->fs_devices->devices;
+       struct btrfs_balance_control *bctl = fs_info->balance_ctl;
+       struct btrfs_root *chunk_root = fs_info->chunk_root;
+       struct btrfs_root *dev_root = fs_info->dev_root;
+       struct list_head *devices;
        struct btrfs_device *device;
        u64 old_size;
        u64 size_to_free;
+       struct btrfs_chunk *chunk;
        struct btrfs_path *path;
        struct btrfs_key key;
-       struct btrfs_root *chunk_root = dev_root->fs_info->chunk_root;
-       struct btrfs_trans_handle *trans;
        struct btrfs_key found_key;
-       if (dev_root->fs_info->sb->s_flags & MS_RDONLY)
-               return -EROFS;
-       if (!capable(CAP_SYS_ADMIN))
-               return -EPERM;
-       mutex_lock(&dev_root->fs_info->volume_mutex);
-       dev_root = dev_root->fs_info->dev_root;
+       struct btrfs_trans_handle *trans;
+       struct extent_buffer *leaf;
+       int slot;
+       int ret;
+       int enospc_errors = 0;
+       bool counting = true;
  
        /* step one make some room on all the devices */
+       devices = &fs_info->fs_devices->devices;
        list_for_each_entry(device, devices, dev_list) {
                old_size = device->total_bytes;
                size_to_free = div_factor(old_size, 1);
                ret = -ENOMEM;
                goto error;
        }
+       /* zero out stat counters */
+       spin_lock(&fs_info->balance_lock);
+       memset(&bctl->stat, 0, sizeof(bctl->stat));
+       spin_unlock(&fs_info->balance_lock);
+ again:
        key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
        key.offset = (u64)-1;
        key.type = BTRFS_CHUNK_ITEM_KEY;
  
        while (1) {
+               if ((!counting && atomic_read(&fs_info->balance_pause_req)) ||
+                   atomic_read(&fs_info->balance_cancel_req)) {
+                       ret = -ECANCELED;
+                       goto error;
+               }
                ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
                if (ret < 0)
                        goto error;
                 * failed
                 */
                if (ret == 0)
-                       break;
+                       BUG(); /* FIXME break ? */
  
                ret = btrfs_previous_item(chunk_root, path, 0,
                                          BTRFS_CHUNK_ITEM_KEY);
-               if (ret)
+               if (ret) {
+                       ret = 0;
                        break;
+               }
+               leaf = path->nodes[0];
+               slot = path->slots[0];
+               btrfs_item_key_to_cpu(leaf, &found_key, slot);
  
-               btrfs_item_key_to_cpu(path->nodes[0], &found_key,
-                                     path->slots[0]);
                if (found_key.objectid != key.objectid)
                        break;
  
                if (found_key.offset == 0)
                        break;
  
+               chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
+               if (!counting) {
+                       spin_lock(&fs_info->balance_lock);
+                       bctl->stat.considered++;
+                       spin_unlock(&fs_info->balance_lock);
+               }
+               ret = should_balance_chunk(chunk_root, leaf, chunk,
+                                          found_key.offset);
                btrfs_release_path(path);
+               if (!ret)
+                       goto loop;
+               if (counting) {
+                       spin_lock(&fs_info->balance_lock);
+                       bctl->stat.expected++;
+                       spin_unlock(&fs_info->balance_lock);
+                       goto loop;
+               }
                ret = btrfs_relocate_chunk(chunk_root,
                                           chunk_root->root_key.objectid,
                                           found_key.objectid,
                                           found_key.offset);
                if (ret && ret != -ENOSPC)
                        goto error;
+               if (ret == -ENOSPC) {
+                       enospc_errors++;
+               } else {
+                       spin_lock(&fs_info->balance_lock);
+                       bctl->stat.completed++;
+                       spin_unlock(&fs_info->balance_lock);
+               }
+ loop:
                key.offset = found_key.offset - 1;
        }
-       ret = 0;
+       if (counting) {
+               btrfs_release_path(path);
+               counting = false;
+               goto again;
+       }
  error:
        btrfs_free_path(path);
-       mutex_unlock(&dev_root->fs_info->volume_mutex);
+       if (enospc_errors) {
+               printk(KERN_INFO "btrfs: %d enospc errors during balance\n",
+                      enospc_errors);
+               if (!ret)
+                       ret = -ENOSPC;
+       }
        return ret;
  }
  
+ /*
+  * Tell whether the balance state should be torn down after a run.
+  *
+  * Returns nonzero when a cancel has been requested, or when neither a pause
+  * nor a cancel request is pending (the normal exit path); returns 0 when
+  * only a pause request is pending.
+  */
+ static inline int balance_need_close(struct btrfs_fs_info *fs_info)
+ {
+       /* cancel requested */
+       if (atomic_read(&fs_info->balance_cancel_req))
+               return 1;
+       /* normal exit path: nobody asked us to pause or cancel */
+       return atomic_read(&fs_info->balance_pause_req) == 0 &&
+               atomic_read(&fs_info->balance_cancel_req) == 0;
+ }
+ /*
+  * Drop the in-memory balance control via unset_balance_control() and then
+  * delete the balance item from the tree root.  Failure to delete the item
+  * is treated as fatal (BUG_ON).
+  */
+ static void __cancel_balance(struct btrfs_fs_info *fs_info)
+ {
+       int err;
+       unset_balance_control(fs_info);
+       err = del_balance_item(fs_info->tree_root);
+       BUG_ON(err);
+ }
+ void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock,
+                              struct btrfs_ioctl_balance_args *bargs);
+ /*
+  * Validate a balance request, record it in the tree root and run it.
+  *
+  * Should be called with both balance and volume mutexes held.  The balance
+  * mutex is dropped around __btrfs_balance() and re-taken afterwards.
+  *
+  * @bctl:  balance request.  When the request is rejected before the balance
+  *         starts, it is kfree()d here, or, for BTRFS_BALANCE_RESUME, torn
+  *         down through __cancel_balance().
+  * @bargs: optional; when non-NULL it is zeroed and refilled by
+  *         update_ioctl_balance_args() after the run.
+  *
+  * A target profile is only validated for a chunk type whose args carry
+  * BTRFS_BALANCE_ARGS_CONVERT: the target of a type that is not being
+  * converted is never used, so stale contents there must not fail the
+  * request.
+  *
+  * Returns -EINVAL for a rejected request, otherwise the result of
+  * insert_balance_item() (on failure) or of __btrfs_balance().
+  */
+ int btrfs_balance(struct btrfs_balance_control *bctl,
+                 struct btrfs_ioctl_balance_args *bargs)
+ {
+       struct btrfs_fs_info *fs_info = bctl->fs_info;
+       u64 allowed;
+       int ret;
+       if (btrfs_fs_closing(fs_info) ||
+           atomic_read(&fs_info->balance_pause_req) ||
+           atomic_read(&fs_info->balance_cancel_req)) {
+               ret = -EINVAL;
+               goto out;
+       }
+       /*
+        * In case of mixed groups both data and meta should be picked,
+        * and identical options should be given for both of them.
+        */
+       allowed = btrfs_super_incompat_flags(fs_info->super_copy);
+       if ((allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) &&
+           (bctl->flags & (BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA))) {
+               if (!(bctl->flags & BTRFS_BALANCE_DATA) ||
+                   !(bctl->flags & BTRFS_BALANCE_METADATA) ||
+                   memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) {
+                       printk(KERN_ERR "btrfs: with mixed groups data and "
+                              "metadata balance options must be the same\n");
+                       ret = -EINVAL;
+                       goto out;
+               }
+       }
+       /*
+        * Profile changing sanity checks.  Skip them if a simple
+        * balance is requested.
+        */
+       if (!((bctl->data.flags | bctl->sys.flags | bctl->meta.flags) &
+             BTRFS_BALANCE_ARGS_CONVERT))
+               goto do_balance;
+       /* profiles that can be a target with this number of devices */
+       allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
+       if (fs_info->fs_devices->num_devices == 1)
+               allowed |= BTRFS_BLOCK_GROUP_DUP;
+       else if (fs_info->fs_devices->num_devices < 4)
+               allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
+       else
+               allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
+                               BTRFS_BLOCK_GROUP_RAID10);
+       /* validate a target only if that type is actually being converted */
+       if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+           (!profile_is_valid(bctl->data.target, 1) ||
+            (bctl->data.target & ~allowed))) {
+               printk(KERN_ERR "btrfs: unable to start balance with target "
+                      "data profile %llu\n",
+                      (unsigned long long)bctl->data.target);
+               ret = -EINVAL;
+               goto out;
+       }
+       if ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+           (!profile_is_valid(bctl->meta.target, 1) ||
+            (bctl->meta.target & ~allowed))) {
+               printk(KERN_ERR "btrfs: unable to start balance with target "
+                      "metadata profile %llu\n",
+                      (unsigned long long)bctl->meta.target);
+               ret = -EINVAL;
+               goto out;
+       }
+       if ((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+           (!profile_is_valid(bctl->sys.target, 1) ||
+            (bctl->sys.target & ~allowed))) {
+               printk(KERN_ERR "btrfs: unable to start balance with target "
+                      "system profile %llu\n",
+                      (unsigned long long)bctl->sys.target);
+               ret = -EINVAL;
+               goto out;
+       }
+       if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+           (bctl->data.target & BTRFS_BLOCK_GROUP_DUP)) {
+               printk(KERN_ERR "btrfs: dup for data is not allowed\n");
+               ret = -EINVAL;
+               goto out;
+       }
+       /* allow to reduce meta or sys integrity only if force set */
+       allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
+                       BTRFS_BLOCK_GROUP_RAID10;
+       if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+            (fs_info->avail_system_alloc_bits & allowed) &&
+            !(bctl->sys.target & allowed)) ||
+           ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+            (fs_info->avail_metadata_alloc_bits & allowed) &&
+            !(bctl->meta.target & allowed))) {
+               if (bctl->flags & BTRFS_BALANCE_FORCE) {
+                       printk(KERN_INFO "btrfs: force reducing metadata "
+                              "integrity\n");
+               } else {
+                       printk(KERN_ERR "btrfs: balance will reduce metadata "
+                              "integrity, use force if you want this\n");
+                       ret = -EINVAL;
+                       goto out;
+               }
+       }
+ do_balance:
+       ret = insert_balance_item(fs_info->tree_root, bctl);
+       if (ret && ret != -EEXIST)
+               goto out;
+       if (!(bctl->flags & BTRFS_BALANCE_RESUME)) {
+               /* a fresh balance must not find an item already there */
+               BUG_ON(ret == -EEXIST);
+               set_balance_control(bctl);
+       } else {
+               /* a resumed balance must find the item it resumes from */
+               BUG_ON(ret != -EEXIST);
+               spin_lock(&fs_info->balance_lock);
+               update_balance_args(bctl);
+               spin_unlock(&fs_info->balance_lock);
+       }
+       atomic_inc(&fs_info->balance_running);
+       mutex_unlock(&fs_info->balance_mutex);
+       ret = __btrfs_balance(fs_info);
+       mutex_lock(&fs_info->balance_mutex);
+       atomic_dec(&fs_info->balance_running);
+       if (bargs) {
+               memset(bargs, 0, sizeof(*bargs));
+               update_ioctl_balance_args(fs_info, 0, bargs);
+       }
+       /*
+        * Tear the balance down on a hard error, on cancel and on normal
+        * completion; -ECANCELED and -ENOSPC alone do not force it.
+        */
+       if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
+           balance_need_close(fs_info)) {
+               __cancel_balance(fs_info);
+       }
+       /* wake up anyone waiting for balance_running to drop */
+       wake_up(&fs_info->balance_wait_q);
+       return ret;
+ out:
+       if (bctl->flags & BTRFS_BALANCE_RESUME)
+               __cancel_balance(fs_info);
+       else
+               kfree(bctl);
+       return ret;
+ }
+ /*
+  * Thread function started by btrfs_recover_balance() to continue a balance.
+  *
+  * @data is the struct btrfs_balance_control to resume.  With the volume and
+  * balance mutexes held, the control is installed with set_balance_control()
+  * and, unless the SKIP_BALANCE mount option is set, handed to
+  * btrfs_balance().
+  *
+  * Returns the result of btrfs_balance(), or 0 when the balance is skipped.
+  */
+ static int balance_kthread(void *data)
+ {
+       struct btrfs_balance_control *bctl = data;
+       struct btrfs_fs_info *fs_info = bctl->fs_info;
+       int ret = 0;
+       mutex_lock(&fs_info->volume_mutex);
+       mutex_lock(&fs_info->balance_mutex);
+       set_balance_control(bctl);
+       if (!btrfs_test_opt(fs_info->tree_root, SKIP_BALANCE)) {
+               printk(KERN_INFO "btrfs: continuing balance\n");
+               ret = btrfs_balance(bctl, NULL);
+       } else {
+               printk(KERN_INFO "btrfs: force skipping balance\n");
+       }
+       mutex_unlock(&fs_info->balance_mutex);
+       mutex_unlock(&fs_info->volume_mutex);
+       return ret;
+ }
+ /*
+  * Look for a balance item in @tree_root and, if there is one, rebuild the
+  * balance control from it and start a "btrfs-balance" kthread to resume.
+  *
+  * The rebuilt control gets BTRFS_BALANCE_RESUME set in its flags and is
+  * passed to balance_kthread(); it is only freed here when no thread was
+  * started.
+  *
+  * Returns 0 when no balance item exists or the thread was started,
+  * -ENOMEM on allocation failure, or the error from btrfs_search_slot() or
+  * kthread_run().
+  */
+ int btrfs_recover_balance(struct btrfs_root *tree_root)
+ {
+       struct btrfs_disk_balance_args disk_bargs;
+       struct btrfs_balance_control *bctl;
+       struct btrfs_balance_item *item;
+       struct extent_buffer *leaf;
+       struct btrfs_path *path;
+       struct task_struct *tsk;
+       struct btrfs_key key;
+       int ret;
+       path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
+       bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
+       if (!bctl) {
+               ret = -ENOMEM;
+               goto out;
+       }
+       key.objectid = BTRFS_BALANCE_OBJECTID;
+       key.offset = 0;
+       key.type = BTRFS_BALANCE_ITEM_KEY;
+       ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
+       if (ret) {
+               /* ret > 0 means there is no balance item: not an error */
+               if (ret > 0)
+                       ret = 0;
+               goto out_bctl;
+       }
+       leaf = path->nodes[0];
+       item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
+       bctl->fs_info = tree_root->fs_info;
+       bctl->flags = btrfs_balance_flags(leaf, item) | BTRFS_BALANCE_RESUME;
+       /* convert the three on-disk argument sets, one at a time */
+       btrfs_balance_data(leaf, item, &disk_bargs);
+       btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs);
+       btrfs_balance_meta(leaf, item, &disk_bargs);
+       btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs);
+       btrfs_balance_sys(leaf, item, &disk_bargs);
+       btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);
+       tsk = kthread_run(balance_kthread, bctl, "btrfs-balance");
+       if (!IS_ERR(tsk))
+               goto out;       /* bctl now belongs to the kthread */
+       ret = PTR_ERR(tsk);
+ out_bctl:
+       kfree(bctl);
+ out:
+       btrfs_free_path(path);
+       return ret;
+ }
+ /*
+  * Ask a running balance to pause and wait until it has stopped.
+  *
+  * A pause request is raised in balance_pause_req, the balance mutex is
+  * dropped while waiting on balance_wait_q for balance_running to reach
+  * zero, and the request is withdrawn again afterwards.
+  *
+  * Returns 0 once the balance has stopped, or -ENOTCONN when there is no
+  * balance control or the balance is not currently running.
+  */
+ int btrfs_pause_balance(struct btrfs_fs_info *fs_info)
+ {
+       int ret = -ENOTCONN;
+       mutex_lock(&fs_info->balance_mutex);
+       if (!fs_info->balance_ctl ||
+           !atomic_read(&fs_info->balance_running))
+               goto out;
+       atomic_inc(&fs_info->balance_pause_req);
+       mutex_unlock(&fs_info->balance_mutex);
+       wait_event(fs_info->balance_wait_q,
+                  atomic_read(&fs_info->balance_running) == 0);
+       mutex_lock(&fs_info->balance_mutex);
+       /* we are good with balance_ctl ripped off from under us */
+       BUG_ON(atomic_read(&fs_info->balance_running));
+       atomic_dec(&fs_info->balance_pause_req);
+       ret = 0;
+ out:
+       mutex_unlock(&fs_info->balance_mutex);
+       return ret;
+ }
+ /*
+  * Cancel the current balance, whether it is running or not.
+  *
+  * A cancel request is raised in balance_cancel_req.  If the balance is
+  * running we only wait for it to stop; the balance item is deleted in
+  * btrfs_balance in that case.  Otherwise the balance is torn down here with
+  * __cancel_balance(), which requires re-taking the mutexes so that
+  * volume_mutex is acquired before balance_mutex.
+  *
+  * Returns 0 on success or -ENOTCONN when there is no balance control.
+  */
+ int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
+ {
+       mutex_lock(&fs_info->balance_mutex);
+       if (!fs_info->balance_ctl) {
+               mutex_unlock(&fs_info->balance_mutex);
+               return -ENOTCONN;
+       }
+       atomic_inc(&fs_info->balance_cancel_req);
+       if (!atomic_read(&fs_info->balance_running)) {
+               /* __cancel_balance needs volume_mutex */
+               mutex_unlock(&fs_info->balance_mutex);
+               mutex_lock(&fs_info->volume_mutex);
+               mutex_lock(&fs_info->balance_mutex);
+               /* re-check: balance_mutex was dropped in between */
+               if (fs_info->balance_ctl)
+                       __cancel_balance(fs_info);
+               mutex_unlock(&fs_info->volume_mutex);
+       } else {
+               /*
+                * if we are running just wait and return, balance item is
+                * deleted in btrfs_balance in this case
+                */
+               mutex_unlock(&fs_info->balance_mutex);
+               wait_event(fs_info->balance_wait_q,
+                          atomic_read(&fs_info->balance_running) == 0);
+               mutex_lock(&fs_info->balance_mutex);
+       }
+       BUG_ON(fs_info->balance_ctl || atomic_read(&fs_info->balance_running));
+       atomic_dec(&fs_info->balance_cancel_req);
+       mutex_unlock(&fs_info->balance_mutex);
+       return 0;
+ }
  /*
   * shrinking a device means finding all of the device extents past
   * the new size, and then following the back refs to the chunks.
@@@ -2441,11 -3162,7 +3162,11 @@@ static int __btrfs_alloc_chunk(struct b
                max_stripe_size = 1024 * 1024 * 1024;
                max_chunk_size = 10 * max_stripe_size;
        } else if (type & BTRFS_BLOCK_GROUP_METADATA) {
 -              max_stripe_size = 256 * 1024 * 1024;
 +              /* for larger filesystems, use larger metadata chunks */
 +              if (fs_devices->total_rw_bytes > 50ULL * 1024 * 1024 * 1024)
 +                      max_stripe_size = 1024 * 1024 * 1024;
 +              else
 +                      max_stripe_size = 256 * 1024 * 1024;
                max_chunk_size = max_stripe_size;
        } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
                max_stripe_size = 8 * 1024 * 1024;
@@@ -2756,8 -3473,7 +3477,7 @@@ static noinline int init_first_rw_devic
                return ret;
  
        alloc_profile = BTRFS_BLOCK_GROUP_METADATA |
-                       (fs_info->metadata_alloc_profile &
-                        fs_info->avail_metadata_alloc_bits);
+                               fs_info->avail_metadata_alloc_bits;
        alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
  
        ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
        sys_chunk_offset = chunk_offset + chunk_size;
  
        alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM |
-                       (fs_info->system_alloc_profile &
-                        fs_info->avail_system_alloc_bits);
+                               fs_info->avail_system_alloc_bits;
        alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
  
        ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map,
@@@ -2955,12 -3670,8 +3674,8 @@@ again
                }
        }
        if (rw & REQ_DISCARD) {
-               if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
-                                BTRFS_BLOCK_GROUP_RAID1 |
-                                BTRFS_BLOCK_GROUP_DUP |
-                                BTRFS_BLOCK_GROUP_RAID10)) {
+               if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK)
                        stripes_required = map->num_stripes;
-               }
        }
        if (bbio_ret && (rw & (REQ_WRITE | REQ_DISCARD)) &&
            stripes_allocated < stripes_required) {
  
        if (rw & REQ_DISCARD)
                *length = min_t(u64, em->len - offset, *length);
-       else if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
-                             BTRFS_BLOCK_GROUP_RAID1 |
-                             BTRFS_BLOCK_GROUP_RAID10 |
-                             BTRFS_BLOCK_GROUP_DUP)) {
+       else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
                /* we limit the length of each bio to what fits in a stripe */
                *length = min_t(u64, em->len - offset,
                                map->stripe_len - stripe_offset);