Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux...
authorLinus Torvalds <torvalds@linux-foundation.org>
Tue, 17 Jan 2012 23:49:54 +0000 (15:49 -0800)
committerLinus Torvalds <torvalds@linux-foundation.org>
Tue, 17 Jan 2012 23:49:54 +0000 (15:49 -0800)
* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs: (62 commits)
  Btrfs: use larger system chunks
  Btrfs: add a delalloc mutex to inodes for delalloc reservations
  Btrfs: space leak tracepoints
  Btrfs: protect orphan block rsv with spin_lock
  Btrfs: add allocator tracepoints
  Btrfs: don't call btrfs_throttle in file write
  Btrfs: release space on error in page_mkwrite
  Btrfs: fix btrfsck error 400 when truncating a compressed
  Btrfs: do not use btrfs_end_transaction_throttle everywhere
  Btrfs: add balance progress reporting
  Btrfs: allow for resuming restriper after it was paused
  Btrfs: allow for canceling restriper
  Btrfs: allow for pausing restriper
  Btrfs: add skip_balance mount option
  Btrfs: recover balance on mount
  Btrfs: save balance parameters to disk
  Btrfs: soft profile changing mode (aka soft convert)
  Btrfs: implement online profile changing
  Btrfs: do not reduce profile in do_chunk_alloc()
  Btrfs: virtual address space subset filter
  ...

Fix up trivial conflict in fs/btrfs/ioctl.c due to the use of the new
mnt_drop_write_file() helper.

34 files changed:
fs/btrfs/Kconfig
fs/btrfs/Makefile
fs/btrfs/backref.c
fs/btrfs/backref.h
fs/btrfs/btrfs_inode.h
fs/btrfs/check-integrity.c [new file with mode: 0644]
fs/btrfs/check-integrity.h [new file with mode: 0644]
fs/btrfs/ctree.c
fs/btrfs/ctree.h
fs/btrfs/delayed-inode.c
fs/btrfs/delayed-ref.c
fs/btrfs/delayed-ref.h
fs/btrfs/disk-io.c
fs/btrfs/extent-tree.c
fs/btrfs/extent_io.c
fs/btrfs/extent_io.h
fs/btrfs/file.c
fs/btrfs/free-space-cache.c
fs/btrfs/inode-map.c
fs/btrfs/inode.c
fs/btrfs/ioctl.c
fs/btrfs/ioctl.h
fs/btrfs/locking.c
fs/btrfs/relocation.c
fs/btrfs/scrub.c
fs/btrfs/super.c
fs/btrfs/transaction.c
fs/btrfs/tree-log.c
fs/btrfs/ulist.c [new file with mode: 0644]
fs/btrfs/ulist.h [new file with mode: 0644]
fs/btrfs/volumes.c
fs/btrfs/volumes.h
fs/btrfs/xattr.c
include/trace/events/btrfs.h

index ecb9fd3be1433838911f4c947627436d816cb136..d33f01c08b60b329247fe4d8729a387af7c263e4 100644 (file)
@@ -31,3 +31,22 @@ config BTRFS_FS_POSIX_ACL
          Linux website <http://acl.bestbits.at/>.
 
          If you don't know what Access Control Lists are, say N
+
+config BTRFS_FS_CHECK_INTEGRITY
+       bool "Btrfs with integrity check tool compiled in (DANGEROUS)"
+       depends on BTRFS_FS
+       help
+         Adds code that examines all block write requests (including
+         writes of the super block). The goal is to verify that the
+         state of the filesystem on disk is always consistent, i.e.,
+         after a power-loss or kernel panic event the filesystem is
+         in a consistent state.
+
+         If the integrity check tool is included and activated in
+         the mount options, plenty of kernel memory is used, and
+         plenty of additional CPU cycles are spent. Enabling this
+         functionality is not intended for normal use.
+
+         In most cases, unless you are a btrfs developer who needs
+         to verify the integrity of (super)-block write requests
+         during the run of a regression test, say N
index c0ddfd29c5e5a348464d5c3d15a77a7708fd8d79..0c4fa2befae793f1a6845322d7ba71aaa5da4374 100644 (file)
@@ -8,6 +8,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
           extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
           export.o tree-log.o free-space-cache.o zlib.o lzo.o \
           compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
-          reada.o backref.o
+          reada.o backref.o ulist.o
 
 btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
+btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
index 22c64fff1bd524b213ce8b13a233f861680d8424..b9a843226de859c34c8e17f9172aeab0ff0f3be1 100644 (file)
 #include "ctree.h"
 #include "disk-io.h"
 #include "backref.h"
+#include "ulist.h"
+#include "transaction.h"
+#include "delayed-ref.h"
 
-struct __data_ref {
+/*
+ * this structure records all encountered refs on the way up to the root
+ */
+struct __prelim_ref {
        struct list_head list;
-       u64 inum;
-       u64 root;
-       u64 extent_data_item_offset;
+       u64 root_id;
+       struct btrfs_key key;
+       int level;
+       int count;
+       u64 parent;
+       u64 wanted_disk_byte;
 };
 
-struct __shared_ref {
-       struct list_head list;
+static int __add_prelim_ref(struct list_head *head, u64 root_id,
+                           struct btrfs_key *key, int level, u64 parent,
+                           u64 wanted_disk_byte, int count)
+{
+       struct __prelim_ref *ref;
+
+       /* in case we're adding delayed refs, we're holding the refs spinlock */
+       ref = kmalloc(sizeof(*ref), GFP_ATOMIC);
+       if (!ref)
+               return -ENOMEM;
+
+       ref->root_id = root_id;
+       if (key)
+               ref->key = *key;
+       else
+               memset(&ref->key, 0, sizeof(ref->key));
+
+       ref->level = level;
+       ref->count = count;
+       ref->parent = parent;
+       ref->wanted_disk_byte = wanted_disk_byte;
+       list_add_tail(&ref->list, head);
+
+       return 0;
+}
+
+static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
+                               struct ulist *parents,
+                               struct extent_buffer *eb, int level,
+                               u64 wanted_objectid, u64 wanted_disk_byte)
+{
+       int ret;
+       int slot;
+       struct btrfs_file_extent_item *fi;
+       struct btrfs_key key;
        u64 disk_byte;
-};
+
+add_parent:
+       ret = ulist_add(parents, eb->start, 0, GFP_NOFS);
+       if (ret < 0)
+               return ret;
+
+       if (level != 0)
+               return 0;
+
+       /*
+        * if the current leaf is full with EXTENT_DATA items, we must
+        * check the next one if that holds a reference as well.
+        * ref->count cannot be used to skip this check.
+        * repeat this until we don't find any additional EXTENT_DATA items.
+        */
+       while (1) {
+               ret = btrfs_next_leaf(root, path);
+               if (ret < 0)
+                       return ret;
+               if (ret)
+                       return 0;
+
+               eb = path->nodes[0];
+               for (slot = 0; slot < btrfs_header_nritems(eb); ++slot) {
+                       btrfs_item_key_to_cpu(eb, &key, slot);
+                       if (key.objectid != wanted_objectid ||
+                           key.type != BTRFS_EXTENT_DATA_KEY)
+                               return 0;
+                       fi = btrfs_item_ptr(eb, slot,
+                                               struct btrfs_file_extent_item);
+                       disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
+                       if (disk_byte == wanted_disk_byte)
+                               goto add_parent;
+               }
+       }
+
+       return 0;
+}
+
+/*
+ * resolve an indirect backref in the form (root_id, key, level)
+ * to a logical address
+ */
+static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
+                                       struct __prelim_ref *ref,
+                                       struct ulist *parents)
+{
+       struct btrfs_path *path;
+       struct btrfs_root *root;
+       struct btrfs_key root_key;
+       struct btrfs_key key = {0};
+       struct extent_buffer *eb;
+       int ret = 0;
+       int root_level;
+       int level = ref->level;
+
+       path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
+
+       root_key.objectid = ref->root_id;
+       root_key.type = BTRFS_ROOT_ITEM_KEY;
+       root_key.offset = (u64)-1;
+       root = btrfs_read_fs_root_no_name(fs_info, &root_key);
+       if (IS_ERR(root)) {
+               ret = PTR_ERR(root);
+               goto out;
+       }
+
+       rcu_read_lock();
+       root_level = btrfs_header_level(root->node);
+       rcu_read_unlock();
+
+       if (root_level + 1 == level)
+               goto out;
+
+       path->lowest_level = level;
+       ret = btrfs_search_slot(NULL, root, &ref->key, path, 0, 0);
+       pr_debug("search slot in root %llu (level %d, ref count %d) returned "
+                "%d for key (%llu %u %llu)\n",
+                (unsigned long long)ref->root_id, level, ref->count, ret,
+                (unsigned long long)ref->key.objectid, ref->key.type,
+                (unsigned long long)ref->key.offset);
+       if (ret < 0)
+               goto out;
+
+       eb = path->nodes[level];
+       if (!eb) {
+               WARN_ON(1);
+               ret = 1;
+               goto out;
+       }
+
+       if (level == 0) {
+               if (ret == 1 && path->slots[0] >= btrfs_header_nritems(eb)) {
+                       ret = btrfs_next_leaf(root, path);
+                       if (ret)
+                               goto out;
+                       eb = path->nodes[0];
+               }
+
+               btrfs_item_key_to_cpu(eb, &key, path->slots[0]);
+       }
+
+       /* the last two parameters will only be used for level == 0 */
+       ret = add_all_parents(root, path, parents, eb, level, key.objectid,
+                               ref->wanted_disk_byte);
+out:
+       btrfs_free_path(path);
+       return ret;
+}
+
+/*
+ * resolve all indirect backrefs from the list
+ */
+static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
+                                  struct list_head *head)
+{
+       int err;
+       int ret = 0;
+       struct __prelim_ref *ref;
+       struct __prelim_ref *ref_safe;
+       struct __prelim_ref *new_ref;
+       struct ulist *parents;
+       struct ulist_node *node;
+
+       parents = ulist_alloc(GFP_NOFS);
+       if (!parents)
+               return -ENOMEM;
+
+       /*
+        * _safe allows us to insert directly after the current item without
+        * iterating over the newly inserted items.
+        * we're also allowed to re-assign ref during iteration.
+        */
+       list_for_each_entry_safe(ref, ref_safe, head, list) {
+               if (ref->parent)        /* already direct */
+                       continue;
+               if (ref->count == 0)
+                       continue;
+               err = __resolve_indirect_ref(fs_info, ref, parents);
+               if (err) {
+                       if (ret == 0)
+                               ret = err;
+                       continue;
+               }
+
+               /* we put the first parent into the ref at hand */
+               node = ulist_next(parents, NULL);
+               ref->parent = node ? node->val : 0;
+
+               /* additional parents require new refs being added here */
+               while ((node = ulist_next(parents, node))) {
+                       new_ref = kmalloc(sizeof(*new_ref), GFP_NOFS);
+                       if (!new_ref) {
+                               ret = -ENOMEM;
+                               break;
+                       }
+                       memcpy(new_ref, ref, sizeof(*ref));
+                       new_ref->parent = node->val;
+                       list_add(&new_ref->list, &ref->list);
+               }
+               ulist_reinit(parents);
+       }
+
+       ulist_free(parents);
+       return ret;
+}
+
+/*
+ * merge two lists of backrefs and adjust counts accordingly
+ *
+ * mode = 1: merge identical keys, if key is set
+ * mode = 2: merge identical parents
+ */
+static int __merge_refs(struct list_head *head, int mode)
+{
+       struct list_head *pos1;
+
+       list_for_each(pos1, head) {
+               struct list_head *n2;
+               struct list_head *pos2;
+               struct __prelim_ref *ref1;
+
+               ref1 = list_entry(pos1, struct __prelim_ref, list);
+
+               if (mode == 1 && ref1->key.type == 0)
+                       continue;
+               for (pos2 = pos1->next, n2 = pos2->next; pos2 != head;
+                    pos2 = n2, n2 = pos2->next) {
+                       struct __prelim_ref *ref2;
+
+                       ref2 = list_entry(pos2, struct __prelim_ref, list);
+
+                       if (mode == 1) {
+                               if (memcmp(&ref1->key, &ref2->key,
+                                          sizeof(ref1->key)) ||
+                                   ref1->level != ref2->level ||
+                                   ref1->root_id != ref2->root_id)
+                                       continue;
+                               ref1->count += ref2->count;
+                       } else {
+                               if (ref1->parent != ref2->parent)
+                                       continue;
+                               ref1->count += ref2->count;
+                       }
+                       list_del(&ref2->list);
+                       kfree(ref2);
+               }
+
+       }
+       return 0;
+}
+
+/*
+ * add all currently queued delayed refs from this head whose seq nr is
+ * smaller or equal that seq to the list
+ */
+static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
+                             struct btrfs_key *info_key,
+                             struct list_head *prefs)
+{
+       struct btrfs_delayed_extent_op *extent_op = head->extent_op;
+       struct rb_node *n = &head->node.rb_node;
+       int sgn;
+       int ret;
+
+       if (extent_op && extent_op->update_key)
+               btrfs_disk_key_to_cpu(info_key, &extent_op->key);
+
+       while ((n = rb_prev(n))) {
+               struct btrfs_delayed_ref_node *node;
+               node = rb_entry(n, struct btrfs_delayed_ref_node,
+                               rb_node);
+               if (node->bytenr != head->node.bytenr)
+                       break;
+               WARN_ON(node->is_head);
+
+               if (node->seq > seq)
+                       continue;
+
+               switch (node->action) {
+               case BTRFS_ADD_DELAYED_EXTENT:
+               case BTRFS_UPDATE_DELAYED_HEAD:
+                       WARN_ON(1);
+                       continue;
+               case BTRFS_ADD_DELAYED_REF:
+                       sgn = 1;
+                       break;
+               case BTRFS_DROP_DELAYED_REF:
+                       sgn = -1;
+                       break;
+               default:
+                       BUG_ON(1);
+               }
+               switch (node->type) {
+               case BTRFS_TREE_BLOCK_REF_KEY: {
+                       struct btrfs_delayed_tree_ref *ref;
+
+                       ref = btrfs_delayed_node_to_tree_ref(node);
+                       ret = __add_prelim_ref(prefs, ref->root, info_key,
+                                              ref->level + 1, 0, node->bytenr,
+                                              node->ref_mod * sgn);
+                       break;
+               }
+               case BTRFS_SHARED_BLOCK_REF_KEY: {
+                       struct btrfs_delayed_tree_ref *ref;
+
+                       ref = btrfs_delayed_node_to_tree_ref(node);
+                       ret = __add_prelim_ref(prefs, ref->root, info_key,
+                                              ref->level + 1, ref->parent,
+                                              node->bytenr,
+                                              node->ref_mod * sgn);
+                       break;
+               }
+               case BTRFS_EXTENT_DATA_REF_KEY: {
+                       struct btrfs_delayed_data_ref *ref;
+                       struct btrfs_key key;
+
+                       ref = btrfs_delayed_node_to_data_ref(node);
+
+                       key.objectid = ref->objectid;
+                       key.type = BTRFS_EXTENT_DATA_KEY;
+                       key.offset = ref->offset;
+                       ret = __add_prelim_ref(prefs, ref->root, &key, 0, 0,
+                                              node->bytenr,
+                                              node->ref_mod * sgn);
+                       break;
+               }
+               case BTRFS_SHARED_DATA_REF_KEY: {
+                       struct btrfs_delayed_data_ref *ref;
+                       struct btrfs_key key;
+
+                       ref = btrfs_delayed_node_to_data_ref(node);
+
+                       key.objectid = ref->objectid;
+                       key.type = BTRFS_EXTENT_DATA_KEY;
+                       key.offset = ref->offset;
+                       ret = __add_prelim_ref(prefs, ref->root, &key, 0,
+                                              ref->parent, node->bytenr,
+                                              node->ref_mod * sgn);
+                       break;
+               }
+               default:
+                       WARN_ON(1);
+               }
+               BUG_ON(ret);
+       }
+
+       return 0;
+}
+
+/*
+ * add all inline backrefs for bytenr to the list
+ */
+static int __add_inline_refs(struct btrfs_fs_info *fs_info,
+                            struct btrfs_path *path, u64 bytenr,
+                            struct btrfs_key *info_key, int *info_level,
+                            struct list_head *prefs)
+{
+       int ret;
+       int slot;
+       struct extent_buffer *leaf;
+       struct btrfs_key key;
+       unsigned long ptr;
+       unsigned long end;
+       struct btrfs_extent_item *ei;
+       u64 flags;
+       u64 item_size;
+
+       /*
+        * enumerate all inline refs
+        */
+       leaf = path->nodes[0];
+       slot = path->slots[0] - 1;
+
+       item_size = btrfs_item_size_nr(leaf, slot);
+       BUG_ON(item_size < sizeof(*ei));
+
+       ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
+       flags = btrfs_extent_flags(leaf, ei);
+
+       ptr = (unsigned long)(ei + 1);
+       end = (unsigned long)ei + item_size;
+
+       if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+               struct btrfs_tree_block_info *info;
+               struct btrfs_disk_key disk_key;
+
+               info = (struct btrfs_tree_block_info *)ptr;
+               *info_level = btrfs_tree_block_level(leaf, info);
+               btrfs_tree_block_key(leaf, info, &disk_key);
+               btrfs_disk_key_to_cpu(info_key, &disk_key);
+               ptr += sizeof(struct btrfs_tree_block_info);
+               BUG_ON(ptr > end);
+       } else {
+               BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA));
+       }
+
+       while (ptr < end) {
+               struct btrfs_extent_inline_ref *iref;
+               u64 offset;
+               int type;
+
+               iref = (struct btrfs_extent_inline_ref *)ptr;
+               type = btrfs_extent_inline_ref_type(leaf, iref);
+               offset = btrfs_extent_inline_ref_offset(leaf, iref);
+
+               switch (type) {
+               case BTRFS_SHARED_BLOCK_REF_KEY:
+                       ret = __add_prelim_ref(prefs, 0, info_key,
+                                               *info_level + 1, offset,
+                                               bytenr, 1);
+                       break;
+               case BTRFS_SHARED_DATA_REF_KEY: {
+                       struct btrfs_shared_data_ref *sdref;
+                       int count;
+
+                       sdref = (struct btrfs_shared_data_ref *)(iref + 1);
+                       count = btrfs_shared_data_ref_count(leaf, sdref);
+                       ret = __add_prelim_ref(prefs, 0, NULL, 0, offset,
+                                              bytenr, count);
+                       break;
+               }
+               case BTRFS_TREE_BLOCK_REF_KEY:
+                       ret = __add_prelim_ref(prefs, offset, info_key,
+                                              *info_level + 1, 0, bytenr, 1);
+                       break;
+               case BTRFS_EXTENT_DATA_REF_KEY: {
+                       struct btrfs_extent_data_ref *dref;
+                       int count;
+                       u64 root;
+
+                       dref = (struct btrfs_extent_data_ref *)(&iref->offset);
+                       count = btrfs_extent_data_ref_count(leaf, dref);
+                       key.objectid = btrfs_extent_data_ref_objectid(leaf,
+                                                                     dref);
+                       key.type = BTRFS_EXTENT_DATA_KEY;
+                       key.offset = btrfs_extent_data_ref_offset(leaf, dref);
+                       root = btrfs_extent_data_ref_root(leaf, dref);
+                       ret = __add_prelim_ref(prefs, root, &key, 0, 0, bytenr,
+                                               count);
+                       break;
+               }
+               default:
+                       WARN_ON(1);
+               }
+               BUG_ON(ret);
+               ptr += btrfs_extent_inline_ref_size(type);
+       }
+
+       return 0;
+}
+
+/*
+ * add all non-inline backrefs for bytenr to the list
+ */
+static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
+                           struct btrfs_path *path, u64 bytenr,
+                           struct btrfs_key *info_key, int info_level,
+                           struct list_head *prefs)
+{
+       struct btrfs_root *extent_root = fs_info->extent_root;
+       int ret;
+       int slot;
+       struct extent_buffer *leaf;
+       struct btrfs_key key;
+
+       while (1) {
+               ret = btrfs_next_item(extent_root, path);
+               if (ret < 0)
+                       break;
+               if (ret) {
+                       ret = 0;
+                       break;
+               }
+
+               slot = path->slots[0];
+               leaf = path->nodes[0];
+               btrfs_item_key_to_cpu(leaf, &key, slot);
+
+               if (key.objectid != bytenr)
+                       break;
+               if (key.type < BTRFS_TREE_BLOCK_REF_KEY)
+                       continue;
+               if (key.type > BTRFS_SHARED_DATA_REF_KEY)
+                       break;
+
+               switch (key.type) {
+               case BTRFS_SHARED_BLOCK_REF_KEY:
+                       ret = __add_prelim_ref(prefs, 0, info_key,
+                                               info_level + 1, key.offset,
+                                               bytenr, 1);
+                       break;
+               case BTRFS_SHARED_DATA_REF_KEY: {
+                       struct btrfs_shared_data_ref *sdref;
+                       int count;
+
+                       sdref = btrfs_item_ptr(leaf, slot,
+                                             struct btrfs_shared_data_ref);
+                       count = btrfs_shared_data_ref_count(leaf, sdref);
+                       ret = __add_prelim_ref(prefs, 0, NULL, 0, key.offset,
+                                               bytenr, count);
+                       break;
+               }
+               case BTRFS_TREE_BLOCK_REF_KEY:
+                       ret = __add_prelim_ref(prefs, key.offset, info_key,
+                                               info_level + 1, 0, bytenr, 1);
+                       break;
+               case BTRFS_EXTENT_DATA_REF_KEY: {
+                       struct btrfs_extent_data_ref *dref;
+                       int count;
+                       u64 root;
+
+                       dref = btrfs_item_ptr(leaf, slot,
+                                             struct btrfs_extent_data_ref);
+                       count = btrfs_extent_data_ref_count(leaf, dref);
+                       key.objectid = btrfs_extent_data_ref_objectid(leaf,
+                                                                     dref);
+                       key.type = BTRFS_EXTENT_DATA_KEY;
+                       key.offset = btrfs_extent_data_ref_offset(leaf, dref);
+                       root = btrfs_extent_data_ref_root(leaf, dref);
+                       ret = __add_prelim_ref(prefs, root, &key, 0, 0,
+                                               bytenr, count);
+                       break;
+               }
+               default:
+                       WARN_ON(1);
+               }
+               BUG_ON(ret);
+       }
+
+       return ret;
+}
+
+/*
+ * this adds all existing backrefs (inline backrefs, backrefs and delayed
+ * refs) for the given bytenr to the refs list, merges duplicates and resolves
+ * indirect refs to their parent bytenr.
+ * When roots are found, they're added to the roots list
+ *
+ * FIXME some caching might speed things up
+ */
+static int find_parent_nodes(struct btrfs_trans_handle *trans,
+                            struct btrfs_fs_info *fs_info, u64 bytenr,
+                            u64 seq, struct ulist *refs, struct ulist *roots)
+{
+       struct btrfs_key key;
+       struct btrfs_path *path;
+       struct btrfs_key info_key = { 0 };
+       struct btrfs_delayed_ref_root *delayed_refs = NULL;
+       struct btrfs_delayed_ref_head *head = NULL;
+       int info_level = 0;
+       int ret;
+       struct list_head prefs_delayed;
+       struct list_head prefs;
+       struct __prelim_ref *ref;
+
+       INIT_LIST_HEAD(&prefs);
+       INIT_LIST_HEAD(&prefs_delayed);
+
+       key.objectid = bytenr;
+       key.type = BTRFS_EXTENT_ITEM_KEY;
+       key.offset = (u64)-1;
+
+       path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
+
+       /*
+        * grab both a lock on the path and a lock on the delayed ref head.
+        * We need both to get a consistent picture of how the refs look
+        * at a specified point in time
+        */
+again:
+       ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0);
+       if (ret < 0)
+               goto out;
+       BUG_ON(ret == 0);
+
+       /*
+        * look if there are updates for this ref queued and lock the head
+        */
+       delayed_refs = &trans->transaction->delayed_refs;
+       spin_lock(&delayed_refs->lock);
+       head = btrfs_find_delayed_ref_head(trans, bytenr);
+       if (head) {
+               if (!mutex_trylock(&head->mutex)) {
+                       atomic_inc(&head->node.refs);
+                       spin_unlock(&delayed_refs->lock);
+
+                       btrfs_release_path(path);
+
+                       /*
+                        * Mutex was contended, block until it's
+                        * released and try again
+                        */
+                       mutex_lock(&head->mutex);
+                       mutex_unlock(&head->mutex);
+                       btrfs_put_delayed_ref(&head->node);
+                       goto again;
+               }
+               ret = __add_delayed_refs(head, seq, &info_key, &prefs_delayed);
+               if (ret)
+                       goto out;
+       }
+       spin_unlock(&delayed_refs->lock);
+
+       if (path->slots[0]) {
+               struct extent_buffer *leaf;
+               int slot;
+
+               leaf = path->nodes[0];
+               slot = path->slots[0] - 1;
+               btrfs_item_key_to_cpu(leaf, &key, slot);
+               if (key.objectid == bytenr &&
+                   key.type == BTRFS_EXTENT_ITEM_KEY) {
+                       ret = __add_inline_refs(fs_info, path, bytenr,
+                                               &info_key, &info_level, &prefs);
+                       if (ret)
+                               goto out;
+                       ret = __add_keyed_refs(fs_info, path, bytenr, &info_key,
+                                              info_level, &prefs);
+                       if (ret)
+                               goto out;
+               }
+       }
+       btrfs_release_path(path);
+
+       /*
+        * when adding the delayed refs above, the info_key might not have
+        * been known yet. Go over the list and replace the missing keys
+        */
+       list_for_each_entry(ref, &prefs_delayed, list) {
+               if ((ref->key.offset | ref->key.type | ref->key.objectid) == 0)
+                       memcpy(&ref->key, &info_key, sizeof(ref->key));
+       }
+       list_splice_init(&prefs_delayed, &prefs);
+
+       ret = __merge_refs(&prefs, 1);
+       if (ret)
+               goto out;
+
+       ret = __resolve_indirect_refs(fs_info, &prefs);
+       if (ret)
+               goto out;
+
+       ret = __merge_refs(&prefs, 2);
+       if (ret)
+               goto out;
+
+       while (!list_empty(&prefs)) {
+               ref = list_first_entry(&prefs, struct __prelim_ref, list);
+               list_del(&ref->list);
+               if (ref->count < 0)
+                       WARN_ON(1);
+               if (ref->count && ref->root_id && ref->parent == 0) {
+                       /* no parent == root of tree */
+                       ret = ulist_add(roots, ref->root_id, 0, GFP_NOFS);
+                       BUG_ON(ret < 0);
+               }
+               if (ref->count && ref->parent) {
+                       ret = ulist_add(refs, ref->parent, 0, GFP_NOFS);
+                       BUG_ON(ret < 0);
+               }
+               kfree(ref);
+       }
+
+out:
+       if (head)
+               mutex_unlock(&head->mutex);
+       btrfs_free_path(path);
+       while (!list_empty(&prefs)) {
+               ref = list_first_entry(&prefs, struct __prelim_ref, list);
+               list_del(&ref->list);
+               kfree(ref);
+       }
+       while (!list_empty(&prefs_delayed)) {
+               ref = list_first_entry(&prefs_delayed, struct __prelim_ref,
+                                      list);
+               list_del(&ref->list);
+               kfree(ref);
+       }
+
+       return ret;
+}
+
+/*
+ * Finds all leafs with a reference to the specified combination of bytenr and
+ * offset. key_list_head will point to a list of corresponding keys (caller must
+ * free each list element). The leafs will be stored in the leafs ulist, which
+ * must be freed with ulist_free.
+ *
+ * returns 0 on success, <0 on error
+ */
+static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
+                               struct btrfs_fs_info *fs_info, u64 bytenr,
+                               u64 num_bytes, u64 seq, struct ulist **leafs)
+{
+       struct ulist *tmp;
+       int ret;
+
+       tmp = ulist_alloc(GFP_NOFS);
+       if (!tmp)
+               return -ENOMEM;
+       *leafs = ulist_alloc(GFP_NOFS);
+       if (!*leafs) {
+               ulist_free(tmp);
+               return -ENOMEM;
+       }
+
+       ret = find_parent_nodes(trans, fs_info, bytenr, seq, *leafs, tmp);
+       ulist_free(tmp);
+
+       if (ret < 0 && ret != -ENOENT) {
+               ulist_free(*leafs);
+               return ret;
+       }
+
+       return 0;
+}
+
+/*
+ * walk all backrefs for a given extent to find all roots that reference this
+ * extent. Walking a backref means finding all extents that reference this
+ * extent and in turn walk the backrefs of those, too. Naturally this is a
+ * recursive process, but here it is implemented in an iterative fashion: We
+ * find all referencing extents for the extent in question and put them on a
+ * list. In turn, we find all referencing extents for those, further appending
+ * to the list. The way we iterate the list allows adding more elements after
+ * the current while iterating. The process stops when we reach the end of the
+ * list. Found roots are added to the roots list.
+ *
+ * returns 0 on success, < 0 on error.
+ */
+int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
+                               struct btrfs_fs_info *fs_info, u64 bytenr,
+                               u64 num_bytes, u64 seq, struct ulist **roots)
+{
+       struct ulist *tmp;
+       struct ulist_node *node = NULL;
+       int ret;
+
+       tmp = ulist_alloc(GFP_NOFS);
+       if (!tmp)
+               return -ENOMEM;
+       *roots = ulist_alloc(GFP_NOFS);
+       if (!*roots) {
+               ulist_free(tmp);
+               return -ENOMEM;
+       }
+
+       while (1) {
+               ret = find_parent_nodes(trans, fs_info, bytenr, seq,
+                                       tmp, *roots);
+               if (ret < 0 && ret != -ENOENT) {
+                       ulist_free(tmp);
+                       ulist_free(*roots);
+                       return ret;
+               }
+               node = ulist_next(tmp, node);
+               if (!node)
+                       break;
+               bytenr = node->val;
+       }
+
+       ulist_free(tmp);
+       return 0;
+}
+
 
 static int __inode_info(u64 inum, u64 ioff, u8 key_type,
                        struct btrfs_root *fs_root, struct btrfs_path *path,
@@ -181,8 +952,11 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
        btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]);
        if (found_key->type != BTRFS_EXTENT_ITEM_KEY ||
            found_key->objectid > logical ||
-           found_key->objectid + found_key->offset <= logical)
+           found_key->objectid + found_key->offset <= logical) {
+               pr_debug("logical %llu is not within any extent\n",
+                        (unsigned long long)logical);
                return -ENOENT;
+       }
 
        eb = path->nodes[0];
        item_size = btrfs_item_size_nr(eb, path->slots[0]);
@@ -191,6 +965,13 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
        ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
        flags = btrfs_extent_flags(eb, ei);
 
+       pr_debug("logical %llu is at position %llu within the extent (%llu "
+                "EXTENT_ITEM %llu) flags %#llx size %u\n",
+                (unsigned long long)logical,
+                (unsigned long long)(logical - found_key->objectid),
+                (unsigned long long)found_key->objectid,
+                (unsigned long long)found_key->offset,
+                (unsigned long long)flags, item_size);
        if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
                return BTRFS_EXTENT_FLAG_TREE_BLOCK;
        if (flags & BTRFS_EXTENT_FLAG_DATA)
@@ -287,128 +1068,11 @@ int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
        return 0;
 }
 
-static int __data_list_add(struct list_head *head, u64 inum,
-                               u64 extent_data_item_offset, u64 root)
-{
-       struct __data_ref *ref;
-
-       ref = kmalloc(sizeof(*ref), GFP_NOFS);
-       if (!ref)
-               return -ENOMEM;
-
-       ref->inum = inum;
-       ref->extent_data_item_offset = extent_data_item_offset;
-       ref->root = root;
-       list_add_tail(&ref->list, head);
-
-       return 0;
-}
-
-static int __data_list_add_eb(struct list_head *head, struct extent_buffer *eb,
-                               struct btrfs_extent_data_ref *dref)
-{
-       return __data_list_add(head, btrfs_extent_data_ref_objectid(eb, dref),
-                               btrfs_extent_data_ref_offset(eb, dref),
-                               btrfs_extent_data_ref_root(eb, dref));
-}
-
-static int __shared_list_add(struct list_head *head, u64 disk_byte)
-{
-       struct __shared_ref *ref;
-
-       ref = kmalloc(sizeof(*ref), GFP_NOFS);
-       if (!ref)
-               return -ENOMEM;
-
-       ref->disk_byte = disk_byte;
-       list_add_tail(&ref->list, head);
-
-       return 0;
-}
-
-static int __iter_shared_inline_ref_inodes(struct btrfs_fs_info *fs_info,
-                                          u64 logical, u64 inum,
-                                          u64 extent_data_item_offset,
-                                          u64 extent_offset,
-                                          struct btrfs_path *path,
-                                          struct list_head *data_refs,
-                                          iterate_extent_inodes_t *iterate,
-                                          void *ctx)
-{
-       u64 ref_root;
-       u32 item_size;
-       struct btrfs_key key;
-       struct extent_buffer *eb;
-       struct btrfs_extent_item *ei;
-       struct btrfs_extent_inline_ref *eiref;
-       struct __data_ref *ref;
-       int ret;
-       int type;
-       int last;
-       unsigned long ptr = 0;
-
-       WARN_ON(!list_empty(data_refs));
-       ret = extent_from_logical(fs_info, logical, path, &key);
-       if (ret & BTRFS_EXTENT_FLAG_DATA)
-               ret = -EIO;
-       if (ret < 0)
-               goto out;
-
-       eb = path->nodes[0];
-       ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
-       item_size = btrfs_item_size_nr(eb, path->slots[0]);
-
-       ret = 0;
-       ref_root = 0;
-       /*
-        * as done in iterate_extent_inodes, we first build a list of refs to
-        * iterate, then free the path and then iterate them to avoid deadlocks.
-        */
-       do {
-               last = __get_extent_inline_ref(&ptr, eb, ei, item_size,
-                                               &eiref, &type);
-               if (last < 0) {
-                       ret = last;
-                       goto out;
-               }
-               if (type == BTRFS_TREE_BLOCK_REF_KEY ||
-                   type == BTRFS_SHARED_BLOCK_REF_KEY) {
-                       ref_root = btrfs_extent_inline_ref_offset(eb, eiref);
-                       ret = __data_list_add(data_refs, inum,
-                                               extent_data_item_offset,
-                                               ref_root);
-               }
-       } while (!ret && !last);
-
-       btrfs_release_path(path);
-
-       if (ref_root == 0) {
-               printk(KERN_ERR "btrfs: failed to find tree block ref "
-                       "for shared data backref %llu\n", logical);
-               WARN_ON(1);
-               ret = -EIO;
-       }
-
-out:
-       while (!list_empty(data_refs)) {
-               ref = list_first_entry(data_refs, struct __data_ref, list);
-               list_del(&ref->list);
-               if (!ret)
-                       ret = iterate(ref->inum, extent_offset +
-                                       ref->extent_data_item_offset,
-                                       ref->root, ctx);
-               kfree(ref);
-       }
-
-       return ret;
-}
-
-static int __iter_shared_inline_ref(struct btrfs_fs_info *fs_info,
-                                   u64 logical, u64 orig_extent_item_objectid,
-                                   u64 extent_offset, struct btrfs_path *path,
-                                   struct list_head *data_refs,
-                                   iterate_extent_inodes_t *iterate,
-                                   void *ctx)
+static int iterate_leaf_refs(struct btrfs_fs_info *fs_info,
+                               struct btrfs_path *path, u64 logical,
+                               u64 orig_extent_item_objectid,
+                               u64 extent_item_pos, u64 root,
+                               iterate_extent_inodes_t *iterate, void *ctx)
 {
        u64 disk_byte;
        struct btrfs_key key;
@@ -416,8 +1080,10 @@ static int __iter_shared_inline_ref(struct btrfs_fs_info *fs_info,
        struct extent_buffer *eb;
        int slot;
        int nritems;
-       int ret;
-       int found = 0;
+       int ret = 0;
+       int extent_type;
+       u64 data_offset;
+       u64 data_len;
 
        eb = read_tree_block(fs_info->tree_root, logical,
                                fs_info->tree_root->leafsize, 0);
@@ -435,149 +1101,99 @@ static int __iter_shared_inline_ref(struct btrfs_fs_info *fs_info,
                if (key.type != BTRFS_EXTENT_DATA_KEY)
                        continue;
                fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
-               if (!fi) {
-                       free_extent_buffer(eb);
-                       return -EIO;
-               }
+               extent_type = btrfs_file_extent_type(eb, fi);
+               if (extent_type == BTRFS_FILE_EXTENT_INLINE)
+                       continue;
+               /* don't skip BTRFS_FILE_EXTENT_PREALLOC, we can handle that */
                disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
-               if (disk_byte != orig_extent_item_objectid) {
-                       if (found)
-                               break;
-                       else
-                               continue;
-               }
-               ++found;
-               ret = __iter_shared_inline_ref_inodes(fs_info, logical,
-                                                       key.objectid,
-                                                       key.offset,
-                                                       extent_offset, path,
-                                                       data_refs,
-                                                       iterate, ctx);
-               if (ret)
-                       break;
-       }
+               if (disk_byte != orig_extent_item_objectid)
+                       continue;
 
-       if (!found) {
-               printk(KERN_ERR "btrfs: failed to follow shared data backref "
-                       "to parent %llu\n", logical);
-               WARN_ON(1);
-               ret = -EIO;
+               data_offset = btrfs_file_extent_offset(eb, fi);
+               data_len = btrfs_file_extent_num_bytes(eb, fi);
+
+               if (extent_item_pos < data_offset ||
+                   extent_item_pos >= data_offset + data_len)
+                       continue;
+
+               pr_debug("ref for %llu resolved, key (%llu EXTEND_DATA %llu), "
+                               "root %llu\n", orig_extent_item_objectid,
+                               key.objectid, key.offset, root);
+               ret = iterate(key.objectid,
+                               key.offset + (extent_item_pos - data_offset),
+                               root, ctx);
+               if (ret) {
+                       pr_debug("stopping iteration because ret=%d\n", ret);
+                       break;
+               }
        }
 
        free_extent_buffer(eb);
+
        return ret;
 }
 
 /*
  * calls iterate() for every inode that references the extent identified by
- * the given parameters. will use the path given as a parameter and return it
- * released.
+ * the given parameters.
  * when the iterator function returns a non-zero value, iteration stops.
+ * path is guaranteed to be in released state when iterate() is called.
  */
 int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
                                struct btrfs_path *path,
-                               u64 extent_item_objectid,
-                               u64 extent_offset,
+                               u64 extent_item_objectid, u64 extent_item_pos,
                                iterate_extent_inodes_t *iterate, void *ctx)
 {
-       unsigned long ptr = 0;
-       int last;
        int ret;
-       int type;
-       u64 logical;
-       u32 item_size;
-       struct btrfs_extent_inline_ref *eiref;
-       struct btrfs_extent_data_ref *dref;
-       struct extent_buffer *eb;
-       struct btrfs_extent_item *ei;
-       struct btrfs_key key;
        struct list_head data_refs = LIST_HEAD_INIT(data_refs);
        struct list_head shared_refs = LIST_HEAD_INIT(shared_refs);
-       struct __data_ref *ref_d;
-       struct __shared_ref *ref_s;
-
-       eb = path->nodes[0];
-       ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
-       item_size = btrfs_item_size_nr(eb, path->slots[0]);
-
-       /* first we iterate the inline refs, ... */
-       do {
-               last = __get_extent_inline_ref(&ptr, eb, ei, item_size,
-                                               &eiref, &type);
-               if (last == -ENOENT) {
-                       ret = 0;
-                       break;
-               }
-               if (last < 0) {
-                       ret = last;
-                       break;
-               }
+       struct btrfs_trans_handle *trans;
+       struct ulist *refs;
+       struct ulist *roots;
+       struct ulist_node *ref_node = NULL;
+       struct ulist_node *root_node = NULL;
+       struct seq_list seq_elem;
+       struct btrfs_delayed_ref_root *delayed_refs;
+
+       trans = btrfs_join_transaction(fs_info->extent_root);
+       if (IS_ERR(trans))
+               return PTR_ERR(trans);
+
+       pr_debug("resolving all inodes for extent %llu\n",
+                       extent_item_objectid);
+
+       delayed_refs = &trans->transaction->delayed_refs;
+       spin_lock(&delayed_refs->lock);
+       btrfs_get_delayed_seq(delayed_refs, &seq_elem);
+       spin_unlock(&delayed_refs->lock);
+
+       ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid,
+                                  extent_item_pos, seq_elem.seq,
+                                  &refs);
 
-               if (type == BTRFS_EXTENT_DATA_REF_KEY) {
-                       dref = (struct btrfs_extent_data_ref *)(&eiref->offset);
-                       ret = __data_list_add_eb(&data_refs, eb, dref);
-               } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
-                       logical = btrfs_extent_inline_ref_offset(eb, eiref);
-                       ret = __shared_list_add(&shared_refs, logical);
-               }
-       } while (!ret && !last);
+       if (ret)
+               goto out;
 
-       /* ... then we proceed to in-tree references and ... */
-       while (!ret) {
-               ++path->slots[0];
-               if (path->slots[0] > btrfs_header_nritems(eb)) {
-                       ret = btrfs_next_leaf(fs_info->extent_root, path);
-                       if (ret) {
-                               if (ret == 1)
-                                       ret = 0; /* we're done */
-                               break;
-                       }
-                       eb = path->nodes[0];
-               }
-               btrfs_item_key_to_cpu(eb, &key, path->slots[0]);
-               if (key.objectid != extent_item_objectid)
+       while (!ret && (ref_node = ulist_next(refs, ref_node))) {
+               ret = btrfs_find_all_roots(trans, fs_info, ref_node->val, -1,
+                                               seq_elem.seq, &roots);
+               if (ret)
                        break;
-               if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
-                       dref = btrfs_item_ptr(eb, path->slots[0],
-                                               struct btrfs_extent_data_ref);
-                       ret = __data_list_add_eb(&data_refs, eb, dref);
-               } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
-                       ret = __shared_list_add(&shared_refs, key.offset);
+               while (!ret && (root_node = ulist_next(roots, root_node))) {
+                       pr_debug("root %llu references leaf %llu\n",
+                                       root_node->val, ref_node->val);
+                       ret = iterate_leaf_refs(fs_info, path, ref_node->val,
+                                               extent_item_objectid,
+                                               extent_item_pos, root_node->val,
+                                               iterate, ctx);
                }
        }
 
-       btrfs_release_path(path);
-
-       /*
-        * ... only at the very end we can process the refs we found. this is
-        * because the iterator function we call is allowed to make tree lookups
-        * and we have to avoid deadlocks. additionally, we need more tree
-        * lookups ourselves for shared data refs.
-        */
-       while (!list_empty(&data_refs)) {
-               ref_d = list_first_entry(&data_refs, struct __data_ref, list);
-               list_del(&ref_d->list);
-               if (!ret)
-                       ret = iterate(ref_d->inum, extent_offset +
-                                       ref_d->extent_data_item_offset,
-                                       ref_d->root, ctx);
-               kfree(ref_d);
-       }
-
-       while (!list_empty(&shared_refs)) {
-               ref_s = list_first_entry(&shared_refs, struct __shared_ref,
-                                       list);
-               list_del(&ref_s->list);
-               if (!ret)
-                       ret = __iter_shared_inline_ref(fs_info,
-                                                       ref_s->disk_byte,
-                                                       extent_item_objectid,
-                                                       extent_offset, path,
-                                                       &data_refs,
-                                                       iterate, ctx);
-               kfree(ref_s);
-       }
-
+       ulist_free(refs);
+       ulist_free(roots);
+out:
+       btrfs_put_delayed_seq(delayed_refs, &seq_elem);
+       btrfs_end_transaction(trans, fs_info->extent_root);
        return ret;
 }
 
@@ -586,19 +1202,20 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
                                iterate_extent_inodes_t *iterate, void *ctx)
 {
        int ret;
-       u64 offset;
+       u64 extent_item_pos;
        struct btrfs_key found_key;
 
        ret = extent_from_logical(fs_info, logical, path,
                                        &found_key);
+       btrfs_release_path(path);
        if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK)
                ret = -EINVAL;
        if (ret < 0)
                return ret;
 
-       offset = logical - found_key.objectid;
+       extent_item_pos = logical - found_key.objectid;
        ret = iterate_extent_inodes(fs_info, path, found_key.objectid,
-                                       offset, iterate, ctx);
+                                       extent_item_pos, iterate, ctx);
 
        return ret;
 }
@@ -643,6 +1260,10 @@ static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
                for (cur = 0; cur < btrfs_item_size(eb, item); cur += len) {
                        name_len = btrfs_inode_ref_name_len(eb, iref);
                        /* path must be released before calling iterate()! */
+                       pr_debug("following ref at offset %u for inode %llu in "
+                                "tree %llu\n", cur,
+                                (unsigned long long)found_key.objectid,
+                                (unsigned long long)fs_root->objectid);
                        ret = iterate(parent, iref, eb, ctx);
                        if (ret) {
                                free_extent_buffer(eb);
@@ -683,10 +1304,14 @@ static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref,
                return PTR_ERR(fspath);
 
        if (fspath > fspath_min) {
+               pr_debug("path resolved: %s\n", fspath);
                ipath->fspath->val[i] = (u64)(unsigned long)fspath;
                ++ipath->fspath->elem_cnt;
                ipath->fspath->bytes_left = fspath - fspath_min;
        } else {
+               pr_debug("missed path, not enough space. missing bytes: %lu, "
+                        "constructed so far: %s\n",
+                        (unsigned long)(fspath_min - fspath), fspath_min);
                ++ipath->fspath->elem_missed;
                ipath->fspath->bytes_missing += fspath_min - fspath;
                ipath->fspath->bytes_left = 0;
index 92618837cb8f94a3a799ea3d08093d9b33aab336..d00dfa9ca9342c96f5057af09fb06418cd943cb6 100644 (file)
@@ -20,6 +20,7 @@
 #define __BTRFS_BACKREF__
 
 #include "ioctl.h"
+#include "ulist.h"
 
 struct inode_fs_paths {
        struct btrfs_path               *btrfs_path;
@@ -54,6 +55,10 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
 
 int paths_from_inode(u64 inum, struct inode_fs_paths *ipath);
 
+int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
+                               struct btrfs_fs_info *fs_info, u64 bytenr,
+                               u64 num_bytes, u64 seq, struct ulist **roots);
+
 struct btrfs_data_container *init_data_container(u32 total_bytes);
 struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
                                        struct btrfs_path *path);
index 634608d2a6d03b5d8741e573de0b142ff8cb6310..9b9b15fd5204347c5ef2931fb186af679cb0d369 100644 (file)
@@ -51,6 +51,9 @@ struct btrfs_inode {
        /* held while logging the inode in tree-log.c */
        struct mutex log_mutex;
 
+       /* held while doing delalloc reservations */
+       struct mutex delalloc_mutex;
+
        /* used to order data wrt metadata */
        struct btrfs_ordered_inode_tree ordered_tree;
 
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
new file mode 100644 (file)
index 0000000..ad0b3ba
--- /dev/null
@@ -0,0 +1,3068 @@
+/*
+ * Copyright (C) STRATO AG 2011.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+/*
+ * This module can be used to catch cases when the btrfs kernel
+ * code executes write requests to the disk that bring the file
+ * system in an inconsistent state. In such a state, a power-loss
+ * or kernel panic event would cause that the data on disk is
+ * lost or at least damaged.
+ *
+ * Code is added that examines all block write requests during
+ * runtime (including writes of the super block). Three rules
+ * are verified and an error is printed on violation of the
+ * rules:
+ * 1. It is not allowed to write a disk block which is
+ *    currently referenced by the super block (either directly
+ *    or indirectly).
+ * 2. When a super block is written, it is verified that all
+ *    referenced (directly or indirectly) blocks fulfill the
+ *    following requirements:
+ *    2a. All referenced blocks have either been present when
+ *        the file system was mounted, (i.e., they have been
+ *        referenced by the super block) or they have been
+ *        written since then and the write completion callback
+ *        was called and a FLUSH request to the device where
+ *        these blocks are located was received and completed.
+ *    2b. All referenced blocks need to have a generation
+ *        number which is equal to the parent's number.
+ *
+ * One issue that was found using this module was that the log
+ * tree on disk became temporarily corrupted because disk blocks
+ * that had been in use for the log tree had been freed and
+ * reused too early, while being referenced by the written super
+ * block.
+ *
+ * The search term in the kernel log that can be used to filter
+ * on the existence of detected integrity issues is
+ * "btrfs: attempt".
+ *
+ * The integrity check is enabled via mount options. These
+ * mount options are only supported if the integrity check
+ * tool is compiled by defining BTRFS_FS_CHECK_INTEGRITY.
+ *
+ * Example #1, apply integrity checks to all metadata:
+ * mount /dev/sdb1 /mnt -o check_int
+ *
+ * Example #2, apply integrity checks to all metadata and
+ * to data extents:
+ * mount /dev/sdb1 /mnt -o check_int_data
+ *
+ * Example #3, apply integrity checks to all metadata and dump
+ * the tree that the super block references to kernel messages
+ * each time after a super block was written:
+ * mount /dev/sdb1 /mnt -o check_int,check_int_print_mask=263
+ *
+ * If the integrity check tool is included and activated in
+ * the mount options, plenty of kernel memory is used, and
+ * plenty of additional CPU cycles are spent. Enabling this
+ * functionality is not intended for normal use. In most
+ * cases, unless you are a btrfs developer who needs to verify
+ * the integrity of (super)-block write requests, do not
+ * enable the config option BTRFS_FS_CHECK_INTEGRITY to
+ * include and compile the integrity check tool.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/buffer_head.h>
+#include <linux/mutex.h>
+#include <linux/crc32c.h>
+#include <linux/genhd.h>
+#include <linux/blkdev.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "extent_io.h"
+#include "disk-io.h"
+#include "volumes.h"
+#include "print-tree.h"
+#include "locking.h"
+#include "check-integrity.h"
+
+#define BTRFSIC_BLOCK_HASHTABLE_SIZE 0x10000
+#define BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE 0x10000
+#define BTRFSIC_DEV2STATE_HASHTABLE_SIZE 0x100
+#define BTRFSIC_BLOCK_MAGIC_NUMBER 0x14491051
+#define BTRFSIC_BLOCK_LINK_MAGIC_NUMBER 0x11070807
+#define BTRFSIC_DEV2STATE_MAGIC_NUMBER 0x20111530
+#define BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER 20111300
+#define BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL (200 - 6)   /* in characters,
+                                                        * excluding " [...]" */
+#define BTRFSIC_BLOCK_SIZE PAGE_SIZE
+
+#define BTRFSIC_GENERATION_UNKNOWN ((u64)-1)
+
+/*
+ * The definition of the bitmask fields for the print_mask.
+ * They are specified with the mount option check_integrity_print_mask.
+ */
+#define BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE                    0x00000001
+#define BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION                0x00000002
+#define BTRFSIC_PRINT_MASK_TREE_AFTER_SB_WRITE                 0x00000004
+#define BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE                        0x00000008
+#define BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH                       0x00000010
+#define BTRFSIC_PRINT_MASK_END_IO_BIO_BH                       0x00000020
+#define BTRFSIC_PRINT_MASK_VERBOSE                             0x00000040
+#define BTRFSIC_PRINT_MASK_VERY_VERBOSE                                0x00000080
+#define BTRFSIC_PRINT_MASK_INITIAL_TREE                                0x00000100
+#define BTRFSIC_PRINT_MASK_INITIAL_ALL_TREES                   0x00000200
+#define BTRFSIC_PRINT_MASK_INITIAL_DATABASE                    0x00000400
+#define BTRFSIC_PRINT_MASK_NUM_COPIES                          0x00000800
+#define BTRFSIC_PRINT_MASK_TREE_WITH_ALL_MIRRORS               0x00001000
+
+struct btrfsic_dev_state;
+struct btrfsic_state;
+
+struct btrfsic_block {
+       u32 magic_num;          /* only used for debug purposes */
+       unsigned int is_metadata:1;     /* if it is meta-data, not data-data */
+       unsigned int is_superblock:1;   /* if it is one of the superblocks */
+       unsigned int is_iodone:1;       /* if is done by lower subsystem */
+       unsigned int iodone_w_error:1;  /* error was indicated to endio */
+       unsigned int never_written:1;   /* block was added because it was
+                                        * referenced, not because it was
+                                        * written */
+       unsigned int mirror_num:2;      /* large enough to hold
+                                        * BTRFS_SUPER_MIRROR_MAX */
+       struct btrfsic_dev_state *dev_state;
+       u64 dev_bytenr;         /* key, physical byte num on disk */
+       u64 logical_bytenr;     /* logical byte num on disk */
+       u64 generation;
+       struct btrfs_disk_key disk_key; /* extra info to print in case of
+                                        * issues, will not always be correct */
+       struct list_head collision_resolving_node;      /* list node */
+       struct list_head all_blocks_node;       /* list node */
+
+       /* the following two lists contain block_link items */
+       struct list_head ref_to_list;   /* list */
+       struct list_head ref_from_list; /* list */
+       struct btrfsic_block *next_in_same_bio;
+       void *orig_bio_bh_private;
+       union {
+               bio_end_io_t *bio;
+               bh_end_io_t *bh;
+       } orig_bio_bh_end_io;
+       int submit_bio_bh_rw;
+       u64 flush_gen; /* only valid if !never_written */
+};
+
+/*
+ * Elements of this type are allocated dynamically and required because
+ * each block object can refer to and can be ref from multiple blocks.
+ * The key to lookup them in the hashtable is the dev_bytenr of
+ * the block ref to plus the one from the block refered from.
+ * The fact that they are searchable via a hashtable and that a
+ * ref_cnt is maintained is not required for the btrfs integrity
+ * check algorithm itself, it is only used to make the output more
+ * beautiful in case that an error is detected (an error is defined
+ * as a write operation to a block while that block is still referenced).
+ */
+struct btrfsic_block_link {
+       u32 magic_num;          /* only used for debug purposes */
+       u32 ref_cnt;
+       struct list_head node_ref_to;   /* list node */
+       struct list_head node_ref_from; /* list node */
+       struct list_head collision_resolving_node;      /* list node */
+       struct btrfsic_block *block_ref_to;
+       struct btrfsic_block *block_ref_from;
+       u64 parent_generation;
+};
+
+struct btrfsic_dev_state {
+       u32 magic_num;          /* only used for debug purposes */
+       struct block_device *bdev;
+       struct btrfsic_state *state;
+       struct list_head collision_resolving_node;      /* list node */
+       struct btrfsic_block dummy_block_for_bio_bh_flush;
+       u64 last_flush_gen;
+       char name[BDEVNAME_SIZE];
+};
+
+struct btrfsic_block_hashtable {
+       struct list_head table[BTRFSIC_BLOCK_HASHTABLE_SIZE];
+};
+
+struct btrfsic_block_link_hashtable {
+       struct list_head table[BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE];
+};
+
+struct btrfsic_dev_state_hashtable {
+       struct list_head table[BTRFSIC_DEV2STATE_HASHTABLE_SIZE];
+};
+
+struct btrfsic_block_data_ctx {
+       u64 start;              /* virtual bytenr */
+       u64 dev_bytenr;         /* physical bytenr on device */
+       u32 len;
+       struct btrfsic_dev_state *dev;
+       char *data;
+       struct buffer_head *bh; /* do not use if set to NULL */
+};
+
+/* This structure is used to implement recursion without occupying
+ * any stack space, refer to btrfsic_process_metablock() */
+struct btrfsic_stack_frame {
+       u32 magic;
+       u32 nr;
+       int error;
+       int i;
+       int limit_nesting;
+       int num_copies;
+       int mirror_num;
+       struct btrfsic_block *block;
+       struct btrfsic_block_data_ctx *block_ctx;
+       struct btrfsic_block *next_block;
+       struct btrfsic_block_data_ctx next_block_ctx;
+       struct btrfs_header *hdr;
+       struct btrfsic_stack_frame *prev;
+};
+
+/* Some state per mounted filesystem */
+struct btrfsic_state {
+       u32 print_mask;
+       int include_extent_data;
+       int csum_size;
+       struct list_head all_blocks_list;
+       struct btrfsic_block_hashtable block_hashtable;
+       struct btrfsic_block_link_hashtable block_link_hashtable;
+       struct btrfs_root *root;
+       u64 max_superblock_generation;
+       struct btrfsic_block *latest_superblock;
+};
+
+static void btrfsic_block_init(struct btrfsic_block *b);
+static struct btrfsic_block *btrfsic_block_alloc(void);
+static void btrfsic_block_free(struct btrfsic_block *b);
+static void btrfsic_block_link_init(struct btrfsic_block_link *n);
+static struct btrfsic_block_link *btrfsic_block_link_alloc(void);
+static void btrfsic_block_link_free(struct btrfsic_block_link *n);
+static void btrfsic_dev_state_init(struct btrfsic_dev_state *ds);
+static struct btrfsic_dev_state *btrfsic_dev_state_alloc(void);
+static void btrfsic_dev_state_free(struct btrfsic_dev_state *ds);
+static void btrfsic_block_hashtable_init(struct btrfsic_block_hashtable *h);
+static void btrfsic_block_hashtable_add(struct btrfsic_block *b,
+                                       struct btrfsic_block_hashtable *h);
+static void btrfsic_block_hashtable_remove(struct btrfsic_block *b);
+static struct btrfsic_block *btrfsic_block_hashtable_lookup(
+               struct block_device *bdev,
+               u64 dev_bytenr,
+               struct btrfsic_block_hashtable *h);
+static void btrfsic_block_link_hashtable_init(
+               struct btrfsic_block_link_hashtable *h);
+static void btrfsic_block_link_hashtable_add(
+               struct btrfsic_block_link *l,
+               struct btrfsic_block_link_hashtable *h);
+static void btrfsic_block_link_hashtable_remove(struct btrfsic_block_link *l);
+static struct btrfsic_block_link *btrfsic_block_link_hashtable_lookup(
+               struct block_device *bdev_ref_to,
+               u64 dev_bytenr_ref_to,
+               struct block_device *bdev_ref_from,
+               u64 dev_bytenr_ref_from,
+               struct btrfsic_block_link_hashtable *h);
+static void btrfsic_dev_state_hashtable_init(
+               struct btrfsic_dev_state_hashtable *h);
+static void btrfsic_dev_state_hashtable_add(
+               struct btrfsic_dev_state *ds,
+               struct btrfsic_dev_state_hashtable *h);
+static void btrfsic_dev_state_hashtable_remove(struct btrfsic_dev_state *ds);
+static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup(
+               struct block_device *bdev,
+               struct btrfsic_dev_state_hashtable *h);
+static struct btrfsic_stack_frame *btrfsic_stack_frame_alloc(void);
+static void btrfsic_stack_frame_free(struct btrfsic_stack_frame *sf);
+static int btrfsic_process_superblock(struct btrfsic_state *state,
+                                     struct btrfs_fs_devices *fs_devices);
+static int btrfsic_process_metablock(struct btrfsic_state *state,
+                                    struct btrfsic_block *block,
+                                    struct btrfsic_block_data_ctx *block_ctx,
+                                    struct btrfs_header *hdr,
+                                    int limit_nesting, int force_iodone_flag);
+static int btrfsic_create_link_to_next_block(
+               struct btrfsic_state *state,
+               struct btrfsic_block *block,
+               struct btrfsic_block_data_ctx
+               *block_ctx, u64 next_bytenr,
+               int limit_nesting,
+               struct btrfsic_block_data_ctx *next_block_ctx,
+               struct btrfsic_block **next_blockp,
+               int force_iodone_flag,
+               int *num_copiesp, int *mirror_nump,
+               struct btrfs_disk_key *disk_key,
+               u64 parent_generation);
+static int btrfsic_handle_extent_data(struct btrfsic_state *state,
+                                     struct btrfsic_block *block,
+                                     struct btrfsic_block_data_ctx *block_ctx,
+                                     u32 item_offset, int force_iodone_flag);
+static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
+                            struct btrfsic_block_data_ctx *block_ctx_out,
+                            int mirror_num);
+static int btrfsic_map_superblock(struct btrfsic_state *state, u64 bytenr,
+                                 u32 len, struct block_device *bdev,
+                                 struct btrfsic_block_data_ctx *block_ctx_out);
+static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx);
+static int btrfsic_read_block(struct btrfsic_state *state,
+                             struct btrfsic_block_data_ctx *block_ctx);
+static void btrfsic_dump_database(struct btrfsic_state *state);
+static int btrfsic_test_for_metadata(struct btrfsic_state *state,
+                                    const u8 *data, unsigned int size);
+static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
+                                         u64 dev_bytenr, u8 *mapped_data,
+                                         unsigned int len, struct bio *bio,
+                                         int *bio_is_patched,
+                                         struct buffer_head *bh,
+                                         int submit_bio_bh_rw);
+static int btrfsic_process_written_superblock(
+               struct btrfsic_state *state,
+               struct btrfsic_block *const block,
+               struct btrfs_super_block *const super_hdr);
+static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status);
+static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate);
+static int btrfsic_is_block_ref_by_superblock(const struct btrfsic_state *state,
+                                             const struct btrfsic_block *block,
+                                             int recursion_level);
+static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state,
+                                       struct btrfsic_block *const block,
+                                       int recursion_level);
+static void btrfsic_print_add_link(const struct btrfsic_state *state,
+                                  const struct btrfsic_block_link *l);
+static void btrfsic_print_rem_link(const struct btrfsic_state *state,
+                                  const struct btrfsic_block_link *l);
+static char btrfsic_get_block_type(const struct btrfsic_state *state,
+                                  const struct btrfsic_block *block);
+static void btrfsic_dump_tree(const struct btrfsic_state *state);
+static void btrfsic_dump_tree_sub(const struct btrfsic_state *state,
+                                 const struct btrfsic_block *block,
+                                 int indent_level);
+static struct btrfsic_block_link *btrfsic_block_link_lookup_or_add(
+               struct btrfsic_state *state,
+               struct btrfsic_block_data_ctx *next_block_ctx,
+               struct btrfsic_block *next_block,
+               struct btrfsic_block *from_block,
+               u64 parent_generation);
+static struct btrfsic_block *btrfsic_block_lookup_or_add(
+               struct btrfsic_state *state,
+               struct btrfsic_block_data_ctx *block_ctx,
+               const char *additional_string,
+               int is_metadata,
+               int is_iodone,
+               int never_written,
+               int mirror_num,
+               int *was_created);
+static int btrfsic_process_superblock_dev_mirror(
+               struct btrfsic_state *state,
+               struct btrfsic_dev_state *dev_state,
+               struct btrfs_device *device,
+               int superblock_mirror_num,
+               struct btrfsic_dev_state **selected_dev_state,
+               struct btrfs_super_block *selected_super);
+static struct btrfsic_dev_state *btrfsic_dev_state_lookup(
+               struct block_device *bdev);
+static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
+                                          u64 bytenr,
+                                          struct btrfsic_dev_state *dev_state,
+                                          u64 dev_bytenr, char *data);
+
+static struct mutex btrfsic_mutex;
+static int btrfsic_is_initialized;
+static struct btrfsic_dev_state_hashtable btrfsic_dev_state_hashtable;
+
+
+static void btrfsic_block_init(struct btrfsic_block *b)
+{
+       b->magic_num = BTRFSIC_BLOCK_MAGIC_NUMBER;
+       b->dev_state = NULL;
+       b->dev_bytenr = 0;
+       b->logical_bytenr = 0;
+       b->generation = BTRFSIC_GENERATION_UNKNOWN;
+       b->disk_key.objectid = 0;
+       b->disk_key.type = 0;
+       b->disk_key.offset = 0;
+       b->is_metadata = 0;
+       b->is_superblock = 0;
+       b->is_iodone = 0;
+       b->iodone_w_error = 0;
+       b->never_written = 0;
+       b->mirror_num = 0;
+       b->next_in_same_bio = NULL;
+       b->orig_bio_bh_private = NULL;
+       b->orig_bio_bh_end_io.bio = NULL;
+       INIT_LIST_HEAD(&b->collision_resolving_node);
+       INIT_LIST_HEAD(&b->all_blocks_node);
+       INIT_LIST_HEAD(&b->ref_to_list);
+       INIT_LIST_HEAD(&b->ref_from_list);
+       b->submit_bio_bh_rw = 0;
+       b->flush_gen = 0;
+}
+
+static struct btrfsic_block *btrfsic_block_alloc(void)
+{
+       struct btrfsic_block *b;
+
+       b = kzalloc(sizeof(*b), GFP_NOFS);
+       if (NULL != b)
+               btrfsic_block_init(b);
+
+       return b;
+}
+
+static void btrfsic_block_free(struct btrfsic_block *b)
+{
+       BUG_ON(!(NULL == b || BTRFSIC_BLOCK_MAGIC_NUMBER == b->magic_num));
+       kfree(b);
+}
+
+static void btrfsic_block_link_init(struct btrfsic_block_link *l)
+{
+       l->magic_num = BTRFSIC_BLOCK_LINK_MAGIC_NUMBER;
+       l->ref_cnt = 1;
+       INIT_LIST_HEAD(&l->node_ref_to);
+       INIT_LIST_HEAD(&l->node_ref_from);
+       INIT_LIST_HEAD(&l->collision_resolving_node);
+       l->block_ref_to = NULL;
+       l->block_ref_from = NULL;
+}
+
+static struct btrfsic_block_link *btrfsic_block_link_alloc(void)
+{
+       struct btrfsic_block_link *l;
+
+       l = kzalloc(sizeof(*l), GFP_NOFS);
+       if (NULL != l)
+               btrfsic_block_link_init(l);
+
+       return l;
+}
+
+static void btrfsic_block_link_free(struct btrfsic_block_link *l)
+{
+       BUG_ON(!(NULL == l || BTRFSIC_BLOCK_LINK_MAGIC_NUMBER == l->magic_num));
+       kfree(l);
+}
+
+static void btrfsic_dev_state_init(struct btrfsic_dev_state *ds)
+{
+       ds->magic_num = BTRFSIC_DEV2STATE_MAGIC_NUMBER;
+       ds->bdev = NULL;
+       ds->state = NULL;
+       ds->name[0] = '\0';
+       INIT_LIST_HEAD(&ds->collision_resolving_node);
+       ds->last_flush_gen = 0;
+       btrfsic_block_init(&ds->dummy_block_for_bio_bh_flush);
+       ds->dummy_block_for_bio_bh_flush.is_iodone = 1;
+       ds->dummy_block_for_bio_bh_flush.dev_state = ds;
+}
+
+static struct btrfsic_dev_state *btrfsic_dev_state_alloc(void)
+{
+       struct btrfsic_dev_state *ds;
+
+       ds = kzalloc(sizeof(*ds), GFP_NOFS);
+       if (NULL != ds)
+               btrfsic_dev_state_init(ds);
+
+       return ds;
+}
+
+static void btrfsic_dev_state_free(struct btrfsic_dev_state *ds)
+{
+       BUG_ON(!(NULL == ds ||
+                BTRFSIC_DEV2STATE_MAGIC_NUMBER == ds->magic_num));
+       kfree(ds);
+}
+
+static void btrfsic_block_hashtable_init(struct btrfsic_block_hashtable *h)
+{
+       int i;
+
+       for (i = 0; i < BTRFSIC_BLOCK_HASHTABLE_SIZE; i++)
+               INIT_LIST_HEAD(h->table + i);
+}
+
+static void btrfsic_block_hashtable_add(struct btrfsic_block *b,
+                                       struct btrfsic_block_hashtable *h)
+{
+       const unsigned int hashval =
+           (((unsigned int)(b->dev_bytenr >> 16)) ^
+            ((unsigned int)((uintptr_t)b->dev_state->bdev))) &
+            (BTRFSIC_BLOCK_HASHTABLE_SIZE - 1);
+
+       list_add(&b->collision_resolving_node, h->table + hashval);
+}
+
+static void btrfsic_block_hashtable_remove(struct btrfsic_block *b)
+{
+       list_del(&b->collision_resolving_node);
+}
+
+static struct btrfsic_block *btrfsic_block_hashtable_lookup(
+               struct block_device *bdev,
+               u64 dev_bytenr,
+               struct btrfsic_block_hashtable *h)
+{
+       const unsigned int hashval =
+           (((unsigned int)(dev_bytenr >> 16)) ^
+            ((unsigned int)((uintptr_t)bdev))) &
+            (BTRFSIC_BLOCK_HASHTABLE_SIZE - 1);
+       struct list_head *elem;
+
+       list_for_each(elem, h->table + hashval) {
+               struct btrfsic_block *const b =
+                   list_entry(elem, struct btrfsic_block,
+                              collision_resolving_node);
+
+               if (b->dev_state->bdev == bdev && b->dev_bytenr == dev_bytenr)
+                       return b;
+       }
+
+       return NULL;
+}
+
+static void btrfsic_block_link_hashtable_init(
+               struct btrfsic_block_link_hashtable *h)
+{
+       int i;
+
+       for (i = 0; i < BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE; i++)
+               INIT_LIST_HEAD(h->table + i);
+}
+
+static void btrfsic_block_link_hashtable_add(
+               struct btrfsic_block_link *l,
+               struct btrfsic_block_link_hashtable *h)
+{
+       const unsigned int hashval =
+           (((unsigned int)(l->block_ref_to->dev_bytenr >> 16)) ^
+            ((unsigned int)(l->block_ref_from->dev_bytenr >> 16)) ^
+            ((unsigned int)((uintptr_t)l->block_ref_to->dev_state->bdev)) ^
+            ((unsigned int)((uintptr_t)l->block_ref_from->dev_state->bdev)))
+            & (BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE - 1);
+
+       BUG_ON(NULL == l->block_ref_to);
+       BUG_ON(NULL == l->block_ref_from);
+       list_add(&l->collision_resolving_node, h->table + hashval);
+}
+
+static void btrfsic_block_link_hashtable_remove(struct btrfsic_block_link *l)
+{
+       list_del(&l->collision_resolving_node);
+}
+
+static struct btrfsic_block_link *btrfsic_block_link_hashtable_lookup(
+               struct block_device *bdev_ref_to,
+               u64 dev_bytenr_ref_to,
+               struct block_device *bdev_ref_from,
+               u64 dev_bytenr_ref_from,
+               struct btrfsic_block_link_hashtable *h)
+{
+       const unsigned int hashval =
+           (((unsigned int)(dev_bytenr_ref_to >> 16)) ^
+            ((unsigned int)(dev_bytenr_ref_from >> 16)) ^
+            ((unsigned int)((uintptr_t)bdev_ref_to)) ^
+            ((unsigned int)((uintptr_t)bdev_ref_from))) &
+            (BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE - 1);
+       struct list_head *elem;
+
+       list_for_each(elem, h->table + hashval) {
+               struct btrfsic_block_link *const l =
+                   list_entry(elem, struct btrfsic_block_link,
+                              collision_resolving_node);
+
+               BUG_ON(NULL == l->block_ref_to);
+               BUG_ON(NULL == l->block_ref_from);
+               if (l->block_ref_to->dev_state->bdev == bdev_ref_to &&
+                   l->block_ref_to->dev_bytenr == dev_bytenr_ref_to &&
+                   l->block_ref_from->dev_state->bdev == bdev_ref_from &&
+                   l->block_ref_from->dev_bytenr == dev_bytenr_ref_from)
+                       return l;
+       }
+
+       return NULL;
+}
+
+static void btrfsic_dev_state_hashtable_init(
+               struct btrfsic_dev_state_hashtable *h)
+{
+       int i;
+
+       for (i = 0; i < BTRFSIC_DEV2STATE_HASHTABLE_SIZE; i++)
+               INIT_LIST_HEAD(h->table + i);
+}
+
+static void btrfsic_dev_state_hashtable_add(
+               struct btrfsic_dev_state *ds,
+               struct btrfsic_dev_state_hashtable *h)
+{
+       const unsigned int hashval =
+           (((unsigned int)((uintptr_t)ds->bdev)) &
+            (BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1));
+
+       list_add(&ds->collision_resolving_node, h->table + hashval);
+}
+
+static void btrfsic_dev_state_hashtable_remove(struct btrfsic_dev_state *ds)
+{
+       list_del(&ds->collision_resolving_node);
+}
+
+static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup(
+               struct block_device *bdev,
+               struct btrfsic_dev_state_hashtable *h)
+{
+       const unsigned int hashval =
+           (((unsigned int)((uintptr_t)bdev)) &
+            (BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1));
+       struct list_head *elem;
+
+       list_for_each(elem, h->table + hashval) {
+               struct btrfsic_dev_state *const ds =
+                   list_entry(elem, struct btrfsic_dev_state,
+                              collision_resolving_node);
+
+               if (ds->bdev == bdev)
+                       return ds;
+       }
+
+       return NULL;
+}
+
+static int btrfsic_process_superblock(struct btrfsic_state *state,
+                                     struct btrfs_fs_devices *fs_devices)
+{
+       int ret;
+       struct btrfs_super_block *selected_super;
+       struct list_head *dev_head = &fs_devices->devices;
+       struct btrfs_device *device;
+       struct btrfsic_dev_state *selected_dev_state = NULL;
+       int pass;
+
+       BUG_ON(NULL == state);
+       selected_super = kmalloc(sizeof(*selected_super), GFP_NOFS);
+       if (NULL == selected_super) {
+               printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
+               return -1;
+       }
+
+       list_for_each_entry(device, dev_head, dev_list) {
+               int i;
+               struct btrfsic_dev_state *dev_state;
+
+               if (!device->bdev || !device->name)
+                       continue;
+
+               dev_state = btrfsic_dev_state_lookup(device->bdev);
+               BUG_ON(NULL == dev_state);
+               for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
+                       ret = btrfsic_process_superblock_dev_mirror(
+                                       state, dev_state, device, i,
+                                       &selected_dev_state, selected_super);
+                       if (0 != ret && 0 == i) {
+                               kfree(selected_super);
+                               return ret;
+                       }
+               }
+       }
+
+       if (NULL == state->latest_superblock) {
+               printk(KERN_INFO "btrfsic: no superblock found!\n");
+               kfree(selected_super);
+               return -1;
+       }
+
+       state->csum_size = btrfs_super_csum_size(selected_super);
+
+       for (pass = 0; pass < 3; pass++) {
+               int num_copies;
+               int mirror_num;
+               u64 next_bytenr;
+
+               switch (pass) {
+               case 0:
+                       next_bytenr = btrfs_super_root(selected_super);
+                       if (state->print_mask &
+                           BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
+                               printk(KERN_INFO "root@%llu\n",
+                                      (unsigned long long)next_bytenr);
+                       break;
+               case 1:
+                       next_bytenr = btrfs_super_chunk_root(selected_super);
+                       if (state->print_mask &
+                           BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
+                               printk(KERN_INFO "chunk@%llu\n",
+                                      (unsigned long long)next_bytenr);
+                       break;
+               case 2:
+                       next_bytenr = btrfs_super_log_root(selected_super);
+                       if (0 == next_bytenr)
+                               continue;
+                       if (state->print_mask &
+                           BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
+                               printk(KERN_INFO "log@%llu\n",
+                                      (unsigned long long)next_bytenr);
+                       break;
+               }
+
+               num_copies =
+                   btrfs_num_copies(&state->root->fs_info->mapping_tree,
+                                    next_bytenr, PAGE_SIZE);
+               if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
+                       printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
+                              (unsigned long long)next_bytenr, num_copies);
+
+               for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
+                       struct btrfsic_block *next_block;
+                       struct btrfsic_block_data_ctx tmp_next_block_ctx;
+                       struct btrfsic_block_link *l;
+                       struct btrfs_header *hdr;
+
+                       ret = btrfsic_map_block(state, next_bytenr, PAGE_SIZE,
+                                               &tmp_next_block_ctx,
+                                               mirror_num);
+                       if (ret) {
+                               printk(KERN_INFO "btrfsic:"
+                                      " btrfsic_map_block(root @%llu,"
+                                      " mirror %d) failed!\n",
+                                      (unsigned long long)next_bytenr,
+                                      mirror_num);
+                               kfree(selected_super);
+                               return -1;
+                       }
+
+                       next_block = btrfsic_block_hashtable_lookup(
+                                       tmp_next_block_ctx.dev->bdev,
+                                       tmp_next_block_ctx.dev_bytenr,
+                                       &state->block_hashtable);
+                       BUG_ON(NULL == next_block);
+
+                       l = btrfsic_block_link_hashtable_lookup(
+                                       tmp_next_block_ctx.dev->bdev,
+                                       tmp_next_block_ctx.dev_bytenr,
+                                       state->latest_superblock->dev_state->
+                                       bdev,
+                                       state->latest_superblock->dev_bytenr,
+                                       &state->block_link_hashtable);
+                       BUG_ON(NULL == l);
+
+                       ret = btrfsic_read_block(state, &tmp_next_block_ctx);
+                       if (ret < (int)BTRFSIC_BLOCK_SIZE) {
+                               printk(KERN_INFO
+                                      "btrfsic: read @logical %llu failed!\n",
+                                      (unsigned long long)
+                                      tmp_next_block_ctx.start);
+                               btrfsic_release_block_ctx(&tmp_next_block_ctx);
+                               kfree(selected_super);
+                               return -1;
+                       }
+
+                       hdr = (struct btrfs_header *)tmp_next_block_ctx.data;
+                       ret = btrfsic_process_metablock(state,
+                                                       next_block,
+                                                       &tmp_next_block_ctx,
+                                                       hdr,
+                                                       BTRFS_MAX_LEVEL + 3, 1);
+                       btrfsic_release_block_ctx(&tmp_next_block_ctx);
+               }
+       }
+
+       kfree(selected_super);
+       return ret;
+}
+
+static int btrfsic_process_superblock_dev_mirror(
+               struct btrfsic_state *state,
+               struct btrfsic_dev_state *dev_state,
+               struct btrfs_device *device,
+               int superblock_mirror_num,
+               struct btrfsic_dev_state **selected_dev_state,
+               struct btrfs_super_block *selected_super)
+{
+       struct btrfs_super_block *super_tmp;
+       u64 dev_bytenr;
+       struct buffer_head *bh;
+       struct btrfsic_block *superblock_tmp;
+       int pass;
+       struct block_device *const superblock_bdev = device->bdev;
+
+       /* super block bytenr is always the unmapped device bytenr */
+       dev_bytenr = btrfs_sb_offset(superblock_mirror_num);
+       bh = __bread(superblock_bdev, dev_bytenr / 4096, 4096);
+       if (NULL == bh)
+               return -1;
+       super_tmp = (struct btrfs_super_block *)
+           (bh->b_data + (dev_bytenr & 4095));
+
+       if (btrfs_super_bytenr(super_tmp) != dev_bytenr ||
+           strncmp((char *)(&(super_tmp->magic)), BTRFS_MAGIC,
+                   sizeof(super_tmp->magic)) ||
+           memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE)) {
+               brelse(bh);
+               return 0;
+       }
+
+       superblock_tmp =
+           btrfsic_block_hashtable_lookup(superblock_bdev,
+                                          dev_bytenr,
+                                          &state->block_hashtable);
+       if (NULL == superblock_tmp) {
+               superblock_tmp = btrfsic_block_alloc();
+               if (NULL == superblock_tmp) {
+                       printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
+                       brelse(bh);
+                       return -1;
+               }
+               /* for superblock, only the dev_bytenr makes sense */
+               superblock_tmp->dev_bytenr = dev_bytenr;
+               superblock_tmp->dev_state = dev_state;
+               superblock_tmp->logical_bytenr = dev_bytenr;
+               superblock_tmp->generation = btrfs_super_generation(super_tmp);
+               superblock_tmp->is_metadata = 1;
+               superblock_tmp->is_superblock = 1;
+               superblock_tmp->is_iodone = 1;
+               superblock_tmp->never_written = 0;
+               superblock_tmp->mirror_num = 1 + superblock_mirror_num;
+               if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE)
+                       printk(KERN_INFO "New initial S-block (bdev %p, %s)"
+                              " @%llu (%s/%llu/%d)\n",
+                              superblock_bdev, device->name,
+                              (unsigned long long)dev_bytenr,
+                              dev_state->name,
+                              (unsigned long long)dev_bytenr,
+                              superblock_mirror_num);
+               list_add(&superblock_tmp->all_blocks_node,
+                        &state->all_blocks_list);
+               btrfsic_block_hashtable_add(superblock_tmp,
+                                           &state->block_hashtable);
+       }
+
+       /* select the one with the highest generation field */
+       if (btrfs_super_generation(super_tmp) >
+           state->max_superblock_generation ||
+           0 == state->max_superblock_generation) {
+               memcpy(selected_super, super_tmp, sizeof(*selected_super));
+               *selected_dev_state = dev_state;
+               state->max_superblock_generation =
+                   btrfs_super_generation(super_tmp);
+               state->latest_superblock = superblock_tmp;
+       }
+
+       for (pass = 0; pass < 3; pass++) {
+               u64 next_bytenr;
+               int num_copies;
+               int mirror_num;
+               const char *additional_string = NULL;
+               struct btrfs_disk_key tmp_disk_key;
+
+               tmp_disk_key.type = BTRFS_ROOT_ITEM_KEY;
+               tmp_disk_key.offset = 0;
+               switch (pass) {
+               case 0:
+                       tmp_disk_key.objectid =
+                           cpu_to_le64(BTRFS_ROOT_TREE_OBJECTID);
+                       additional_string = "initial root ";
+                       next_bytenr = btrfs_super_root(super_tmp);
+                       break;
+               case 1:
+                       tmp_disk_key.objectid =
+                           cpu_to_le64(BTRFS_CHUNK_TREE_OBJECTID);
+                       additional_string = "initial chunk ";
+                       next_bytenr = btrfs_super_chunk_root(super_tmp);
+                       break;
+               case 2:
+                       tmp_disk_key.objectid =
+                           cpu_to_le64(BTRFS_TREE_LOG_OBJECTID);
+                       additional_string = "initial log ";
+                       next_bytenr = btrfs_super_log_root(super_tmp);
+                       if (0 == next_bytenr)
+                               continue;
+                       break;
+               }
+
+               num_copies =
+                   btrfs_num_copies(&state->root->fs_info->mapping_tree,
+                                    next_bytenr, PAGE_SIZE);
+               if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
+                       printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
+                              (unsigned long long)next_bytenr, num_copies);
+               for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
+                       struct btrfsic_block *next_block;
+                       struct btrfsic_block_data_ctx tmp_next_block_ctx;
+                       struct btrfsic_block_link *l;
+
+                       if (btrfsic_map_block(state, next_bytenr, PAGE_SIZE,
+                                             &tmp_next_block_ctx,
+                                             mirror_num)) {
+                               printk(KERN_INFO "btrfsic: btrfsic_map_block("
+                                      "bytenr @%llu, mirror %d) failed!\n",
+                                      (unsigned long long)next_bytenr,
+                                      mirror_num);
+                               brelse(bh);
+                               return -1;
+                       }
+
+                       next_block = btrfsic_block_lookup_or_add(
+                                       state, &tmp_next_block_ctx,
+                                       additional_string, 1, 1, 0,
+                                       mirror_num, NULL);
+                       if (NULL == next_block) {
+                               btrfsic_release_block_ctx(&tmp_next_block_ctx);
+                               brelse(bh);
+                               return -1;
+                       }
+
+                       next_block->disk_key = tmp_disk_key;
+                       next_block->generation = BTRFSIC_GENERATION_UNKNOWN;
+                       l = btrfsic_block_link_lookup_or_add(
+                                       state, &tmp_next_block_ctx,
+                                       next_block, superblock_tmp,
+                                       BTRFSIC_GENERATION_UNKNOWN);
+                       btrfsic_release_block_ctx(&tmp_next_block_ctx);
+                       if (NULL == l) {
+                               brelse(bh);
+                               return -1;
+                       }
+               }
+       }
+       if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_ALL_TREES)
+               btrfsic_dump_tree_sub(state, superblock_tmp, 0);
+
+       brelse(bh);
+       return 0;
+}
+
+static struct btrfsic_stack_frame *btrfsic_stack_frame_alloc(void)
+{
+       struct btrfsic_stack_frame *sf;
+
+       sf = kzalloc(sizeof(*sf), GFP_NOFS);
+       if (NULL == sf)
+               printk(KERN_INFO "btrfsic: alloc memory failed!\n");
+       else
+               sf->magic = BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER;
+       return sf;
+}
+
+static void btrfsic_stack_frame_free(struct btrfsic_stack_frame *sf)
+{
+       BUG_ON(!(NULL == sf ||
+                BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER == sf->magic));
+       kfree(sf);
+}
+
+static int btrfsic_process_metablock(
+               struct btrfsic_state *state,
+               struct btrfsic_block *const first_block,
+               struct btrfsic_block_data_ctx *const first_block_ctx,
+               struct btrfs_header *const first_hdr,
+               int first_limit_nesting, int force_iodone_flag)
+{
+       struct btrfsic_stack_frame initial_stack_frame = { 0 };
+       struct btrfsic_stack_frame *sf;
+       struct btrfsic_stack_frame *next_stack;
+
+       sf = &initial_stack_frame;
+       sf->error = 0;
+       sf->i = -1;
+       sf->limit_nesting = first_limit_nesting;
+       sf->block = first_block;
+       sf->block_ctx = first_block_ctx;
+       sf->next_block = NULL;
+       sf->hdr = first_hdr;
+       sf->prev = NULL;
+
+continue_with_new_stack_frame:
+       sf->block->generation = le64_to_cpu(sf->hdr->generation);
+       if (0 == sf->hdr->level) {
+               struct btrfs_leaf *const leafhdr =
+                   (struct btrfs_leaf *)sf->hdr;
+
+               if (-1 == sf->i) {
+                       sf->nr = le32_to_cpu(leafhdr->header.nritems);
+
+                       if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+                               printk(KERN_INFO
+                                      "leaf %llu items %d generation %llu"
+                                      " owner %llu\n",
+                                      (unsigned long long)
+                                      sf->block_ctx->start,
+                                      sf->nr,
+                                      (unsigned long long)
+                                      le64_to_cpu(leafhdr->header.generation),
+                                      (unsigned long long)
+                                      le64_to_cpu(leafhdr->header.owner));
+               }
+
+continue_with_current_leaf_stack_frame:
+               if (0 == sf->num_copies || sf->mirror_num > sf->num_copies) {
+                       sf->i++;
+                       sf->num_copies = 0;
+               }
+
+               if (sf->i < sf->nr) {
+                       struct btrfs_item *disk_item = leafhdr->items + sf->i;
+                       struct btrfs_disk_key *disk_key = &disk_item->key;
+                       u8 type;
+                       const u32 item_offset = le32_to_cpu(disk_item->offset);
+
+                       type = disk_key->type;
+
+                       if (BTRFS_ROOT_ITEM_KEY == type) {
+                               const struct btrfs_root_item *const root_item =
+                                   (struct btrfs_root_item *)
+                                   (sf->block_ctx->data +
+                                    offsetof(struct btrfs_leaf, items) +
+                                    item_offset);
+                               const u64 next_bytenr =
+                                   le64_to_cpu(root_item->bytenr);
+
+                               sf->error =
+                                   btrfsic_create_link_to_next_block(
+                                               state,
+                                               sf->block,
+                                               sf->block_ctx,
+                                               next_bytenr,
+                                               sf->limit_nesting,
+                                               &sf->next_block_ctx,
+                                               &sf->next_block,
+                                               force_iodone_flag,
+                                               &sf->num_copies,
+                                               &sf->mirror_num,
+                                               disk_key,
+                                               le64_to_cpu(root_item->
+                                               generation));
+                               if (sf->error)
+                                       goto one_stack_frame_backwards;
+
+                               if (NULL != sf->next_block) {
+                                       struct btrfs_header *const next_hdr =
+                                           (struct btrfs_header *)
+                                           sf->next_block_ctx.data;
+
+                                       next_stack =
+                                           btrfsic_stack_frame_alloc();
+                                       if (NULL == next_stack) {
+                                               btrfsic_release_block_ctx(
+                                                               &sf->
+                                                               next_block_ctx);
+                                               goto one_stack_frame_backwards;
+                                       }
+
+                                       next_stack->i = -1;
+                                       next_stack->block = sf->next_block;
+                                       next_stack->block_ctx =
+                                           &sf->next_block_ctx;
+                                       next_stack->next_block = NULL;
+                                       next_stack->hdr = next_hdr;
+                                       next_stack->limit_nesting =
+                                           sf->limit_nesting - 1;
+                                       next_stack->prev = sf;
+                                       sf = next_stack;
+                                       goto continue_with_new_stack_frame;
+                               }
+                       } else if (BTRFS_EXTENT_DATA_KEY == type &&
+                                  state->include_extent_data) {
+                               sf->error = btrfsic_handle_extent_data(
+                                               state,
+                                               sf->block,
+                                               sf->block_ctx,
+                                               item_offset,
+                                               force_iodone_flag);
+                               if (sf->error)
+                                       goto one_stack_frame_backwards;
+                       }
+
+                       goto continue_with_current_leaf_stack_frame;
+               }
+       } else {
+               struct btrfs_node *const nodehdr = (struct btrfs_node *)sf->hdr;
+
+               if (-1 == sf->i) {
+                       sf->nr = le32_to_cpu(nodehdr->header.nritems);
+
+                       if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+                               printk(KERN_INFO "node %llu level %d items %d"
+                                      " generation %llu owner %llu\n",
+                                      (unsigned long long)
+                                      sf->block_ctx->start,
+                                      nodehdr->header.level, sf->nr,
+                                      (unsigned long long)
+                                      le64_to_cpu(nodehdr->header.generation),
+                                      (unsigned long long)
+                                      le64_to_cpu(nodehdr->header.owner));
+               }
+
+continue_with_current_node_stack_frame:
+               if (0 == sf->num_copies || sf->mirror_num > sf->num_copies) {
+                       sf->i++;
+                       sf->num_copies = 0;
+               }
+
+               if (sf->i < sf->nr) {
+                       struct btrfs_key_ptr *disk_key_ptr =
+                           nodehdr->ptrs + sf->i;
+                       const u64 next_bytenr =
+                           le64_to_cpu(disk_key_ptr->blockptr);
+
+                       sf->error = btrfsic_create_link_to_next_block(
+                                       state,
+                                       sf->block,
+                                       sf->block_ctx,
+                                       next_bytenr,
+                                       sf->limit_nesting,
+                                       &sf->next_block_ctx,
+                                       &sf->next_block,
+                                       force_iodone_flag,
+                                       &sf->num_copies,
+                                       &sf->mirror_num,
+                                       &disk_key_ptr->key,
+                                       le64_to_cpu(disk_key_ptr->generation));
+                       if (sf->error)
+                               goto one_stack_frame_backwards;
+
+                       if (NULL != sf->next_block) {
+                               struct btrfs_header *const next_hdr =
+                                   (struct btrfs_header *)
+                                   sf->next_block_ctx.data;
+
+                               next_stack = btrfsic_stack_frame_alloc();
+                               if (NULL == next_stack)
+                                       goto one_stack_frame_backwards;
+
+                               next_stack->i = -1;
+                               next_stack->block = sf->next_block;
+                               next_stack->block_ctx = &sf->next_block_ctx;
+                               next_stack->next_block = NULL;
+                               next_stack->hdr = next_hdr;
+                               next_stack->limit_nesting =
+                                   sf->limit_nesting - 1;
+                               next_stack->prev = sf;
+                               sf = next_stack;
+                               goto continue_with_new_stack_frame;
+                       }
+
+                       goto continue_with_current_node_stack_frame;
+               }
+       }
+
+one_stack_frame_backwards:
+       if (NULL != sf->prev) {
+               struct btrfsic_stack_frame *const prev = sf->prev;
+
+               /* the one for the initial block is freed in the caller */
+               btrfsic_release_block_ctx(sf->block_ctx);
+
+               if (sf->error) {
+                       prev->error = sf->error;
+                       btrfsic_stack_frame_free(sf);
+                       sf = prev;
+                       goto one_stack_frame_backwards;
+               }
+
+               btrfsic_stack_frame_free(sf);
+               sf = prev;
+               goto continue_with_new_stack_frame;
+       } else {
+               BUG_ON(&initial_stack_frame != sf);
+       }
+
+       return sf->error;
+}
+
+static int btrfsic_create_link_to_next_block(
+               struct btrfsic_state *state,
+               struct btrfsic_block *block,
+               struct btrfsic_block_data_ctx *block_ctx,
+               u64 next_bytenr,
+               int limit_nesting,
+               struct btrfsic_block_data_ctx *next_block_ctx,
+               struct btrfsic_block **next_blockp,
+               int force_iodone_flag,
+               int *num_copiesp, int *mirror_nump,
+               struct btrfs_disk_key *disk_key,
+               u64 parent_generation)
+{
+       struct btrfsic_block *next_block = NULL;
+       int ret;
+       struct btrfsic_block_link *l;
+       int did_alloc_block_link;
+       int block_was_created;
+
+       *next_blockp = NULL;
+       if (0 == *num_copiesp) {
+               *num_copiesp =
+                   btrfs_num_copies(&state->root->fs_info->mapping_tree,
+                                    next_bytenr, PAGE_SIZE);
+               if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
+                       printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
+                              (unsigned long long)next_bytenr, *num_copiesp);
+               *mirror_nump = 1;
+       }
+
+       if (*mirror_nump > *num_copiesp)
+               return 0;
+
+       if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+               printk(KERN_INFO
+                      "btrfsic_create_link_to_next_block(mirror_num=%d)\n",
+                      *mirror_nump);
+       ret = btrfsic_map_block(state, next_bytenr,
+                               BTRFSIC_BLOCK_SIZE,
+                               next_block_ctx, *mirror_nump);
+       if (ret) {
+               printk(KERN_INFO
+                      "btrfsic: btrfsic_map_block(@%llu, mirror=%d) failed!\n",
+                      (unsigned long long)next_bytenr, *mirror_nump);
+               btrfsic_release_block_ctx(next_block_ctx);
+               *next_blockp = NULL;
+               return -1;
+       }
+
+       next_block = btrfsic_block_lookup_or_add(state,
+                                                next_block_ctx, "referenced ",
+                                                1, force_iodone_flag,
+                                                !force_iodone_flag,
+                                                *mirror_nump,
+                                                &block_was_created);
+       if (NULL == next_block) {
+               btrfsic_release_block_ctx(next_block_ctx);
+               *next_blockp = NULL;
+               return -1;
+       }
+       if (block_was_created) {
+               l = NULL;
+               next_block->generation = BTRFSIC_GENERATION_UNKNOWN;
+       } else {
+               if (next_block->logical_bytenr != next_bytenr &&
+                   !(!next_block->is_metadata &&
+                     0 == next_block->logical_bytenr)) {
+                       printk(KERN_INFO
+                              "Referenced block @%llu (%s/%llu/%d)"
+                              " found in hash table, %c,"
+                              " bytenr mismatch (!= stored %llu).\n",
+                              (unsigned long long)next_bytenr,
+                              next_block_ctx->dev->name,
+                              (unsigned long long)next_block_ctx->dev_bytenr,
+                              *mirror_nump,
+                              btrfsic_get_block_type(state, next_block),
+                              (unsigned long long)next_block->logical_bytenr);
+               } else if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+                       printk(KERN_INFO
+                              "Referenced block @%llu (%s/%llu/%d)"
+                              " found in hash table, %c.\n",
+                              (unsigned long long)next_bytenr,
+                              next_block_ctx->dev->name,
+                              (unsigned long long)next_block_ctx->dev_bytenr,
+                              *mirror_nump,
+                              btrfsic_get_block_type(state, next_block));
+               next_block->logical_bytenr = next_bytenr;
+
+               next_block->mirror_num = *mirror_nump;
+               l = btrfsic_block_link_hashtable_lookup(
+                               next_block_ctx->dev->bdev,
+                               next_block_ctx->dev_bytenr,
+                               block_ctx->dev->bdev,
+                               block_ctx->dev_bytenr,
+                               &state->block_link_hashtable);
+       }
+
+       next_block->disk_key = *disk_key;
+       if (NULL == l) {
+               l = btrfsic_block_link_alloc();
+               if (NULL == l) {
+                       printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
+                       btrfsic_release_block_ctx(next_block_ctx);
+                       *next_blockp = NULL;
+                       return -1;
+               }
+
+               did_alloc_block_link = 1;
+               l->block_ref_to = next_block;
+               l->block_ref_from = block;
+               l->ref_cnt = 1;
+               l->parent_generation = parent_generation;
+
+               if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+                       btrfsic_print_add_link(state, l);
+
+               list_add(&l->node_ref_to, &block->ref_to_list);
+               list_add(&l->node_ref_from, &next_block->ref_from_list);
+
+               btrfsic_block_link_hashtable_add(l,
+                                                &state->block_link_hashtable);
+       } else {
+               did_alloc_block_link = 0;
+               if (0 == limit_nesting) {
+                       l->ref_cnt++;
+                       l->parent_generation = parent_generation;
+                       if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+                               btrfsic_print_add_link(state, l);
+               }
+       }
+
+       if (limit_nesting > 0 && did_alloc_block_link) {
+               ret = btrfsic_read_block(state, next_block_ctx);
+               if (ret < (int)BTRFSIC_BLOCK_SIZE) {
+                       printk(KERN_INFO
+                              "btrfsic: read block @logical %llu failed!\n",
+                              (unsigned long long)next_bytenr);
+                       btrfsic_release_block_ctx(next_block_ctx);
+                       *next_blockp = NULL;
+                       return -1;
+               }
+
+               *next_blockp = next_block;
+       } else {
+               *next_blockp = NULL;
+       }
+       (*mirror_nump)++;
+
+       return 0;
+}
+
+static int btrfsic_handle_extent_data(
+               struct btrfsic_state *state,
+               struct btrfsic_block *block,
+               struct btrfsic_block_data_ctx *block_ctx,
+               u32 item_offset, int force_iodone_flag)
+{
+       int ret;
+       struct btrfs_file_extent_item *file_extent_item =
+           (struct btrfs_file_extent_item *)(block_ctx->data +
+                                             offsetof(struct btrfs_leaf,
+                                                      items) + item_offset);
+       u64 next_bytenr =
+           le64_to_cpu(file_extent_item->disk_bytenr) +
+           le64_to_cpu(file_extent_item->offset);
+       u64 num_bytes = le64_to_cpu(file_extent_item->num_bytes);
+       u64 generation = le64_to_cpu(file_extent_item->generation);
+       struct btrfsic_block_link *l;
+
+       if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE)
+               printk(KERN_INFO "extent_data: type %u, disk_bytenr = %llu,"
+                      " offset = %llu, num_bytes = %llu\n",
+                      file_extent_item->type,
+                      (unsigned long long)
+                      le64_to_cpu(file_extent_item->disk_bytenr),
+                      (unsigned long long)
+                      le64_to_cpu(file_extent_item->offset),
+                      (unsigned long long)
+                      le64_to_cpu(file_extent_item->num_bytes));
+       if (BTRFS_FILE_EXTENT_REG != file_extent_item->type ||
+           ((u64)0) == le64_to_cpu(file_extent_item->disk_bytenr))
+               return 0;
+       while (num_bytes > 0) {
+               u32 chunk_len;
+               int num_copies;
+               int mirror_num;
+
+               if (num_bytes > BTRFSIC_BLOCK_SIZE)
+                       chunk_len = BTRFSIC_BLOCK_SIZE;
+               else
+                       chunk_len = num_bytes;
+
+               num_copies =
+                   btrfs_num_copies(&state->root->fs_info->mapping_tree,
+                                    next_bytenr, PAGE_SIZE);
+               if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
+                       printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
+                              (unsigned long long)next_bytenr, num_copies);
+               for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
+                       struct btrfsic_block_data_ctx next_block_ctx;
+                       struct btrfsic_block *next_block;
+                       int block_was_created;
+
+                       if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+                               printk(KERN_INFO "btrfsic_handle_extent_data("
+                                      "mirror_num=%d)\n", mirror_num);
+                       if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE)
+                               printk(KERN_INFO
+                                      "\tdisk_bytenr = %llu, num_bytes %u\n",
+                                      (unsigned long long)next_bytenr,
+                                      chunk_len);
+                       ret = btrfsic_map_block(state, next_bytenr,
+                                               chunk_len, &next_block_ctx,
+                                               mirror_num);
+                       if (ret) {
+                               printk(KERN_INFO
+                                      "btrfsic: btrfsic_map_block(@%llu,"
+                                      " mirror=%d) failed!\n",
+                                      (unsigned long long)next_bytenr,
+                                      mirror_num);
+                               return -1;
+                       }
+
+                       next_block = btrfsic_block_lookup_or_add(
+                                       state,
+                                       &next_block_ctx,
+                                       "referenced ",
+                                       0,
+                                       force_iodone_flag,
+                                       !force_iodone_flag,
+                                       mirror_num,
+                                       &block_was_created);
+                       if (NULL == next_block) {
+                               printk(KERN_INFO
+                                      "btrfsic: error, kmalloc failed!\n");
+                               btrfsic_release_block_ctx(&next_block_ctx);
+                               return -1;
+                       }
+                       if (!block_was_created) {
+                               if (next_block->logical_bytenr != next_bytenr &&
+                                   !(!next_block->is_metadata &&
+                                     0 == next_block->logical_bytenr)) {
+                                       printk(KERN_INFO
+                                              "Referenced block"
+                                              " @%llu (%s/%llu/%d)"
+                                              " found in hash table, D,"
+                                              " bytenr mismatch"
+                                              " (!= stored %llu).\n",
+                                              (unsigned long long)next_bytenr,
+                                              next_block_ctx.dev->name,
+                                              (unsigned long long)
+                                              next_block_ctx.dev_bytenr,
+                                              mirror_num,
+                                              (unsigned long long)
+                                              next_block->logical_bytenr);
+                               }
+                               next_block->logical_bytenr = next_bytenr;
+                               next_block->mirror_num = mirror_num;
+                       }
+
+                       l = btrfsic_block_link_lookup_or_add(state,
+                                                            &next_block_ctx,
+                                                            next_block, block,
+                                                            generation);
+                       btrfsic_release_block_ctx(&next_block_ctx);
+                       if (NULL == l)
+                               return -1;
+               }
+
+               next_bytenr += chunk_len;
+               num_bytes -= chunk_len;
+       }
+
+       return 0;
+}
+
+static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
+                            struct btrfsic_block_data_ctx *block_ctx_out,
+                            int mirror_num)
+{
+       int ret;
+       u64 length;
+       struct btrfs_bio *multi = NULL;
+       struct btrfs_device *device;
+
+       length = len;
+       ret = btrfs_map_block(&state->root->fs_info->mapping_tree, READ,
+                             bytenr, &length, &multi, mirror_num);
+
+       device = multi->stripes[0].dev;
+       block_ctx_out->dev = btrfsic_dev_state_lookup(device->bdev);
+       block_ctx_out->dev_bytenr = multi->stripes[0].physical;
+       block_ctx_out->start = bytenr;
+       block_ctx_out->len = len;
+       block_ctx_out->data = NULL;
+       block_ctx_out->bh = NULL;
+
+       if (0 == ret)
+               kfree(multi);
+       if (NULL == block_ctx_out->dev) {
+               ret = -ENXIO;
+               printk(KERN_INFO "btrfsic: error, cannot lookup dev (#1)!\n");
+       }
+
+       return ret;
+}
+
+static int btrfsic_map_superblock(struct btrfsic_state *state, u64 bytenr,
+                                 u32 len, struct block_device *bdev,
+                                 struct btrfsic_block_data_ctx *block_ctx_out)
+{
+       block_ctx_out->dev = btrfsic_dev_state_lookup(bdev);
+       block_ctx_out->dev_bytenr = bytenr;
+       block_ctx_out->start = bytenr;
+       block_ctx_out->len = len;
+       block_ctx_out->data = NULL;
+       block_ctx_out->bh = NULL;
+       if (NULL != block_ctx_out->dev) {
+               return 0;
+       } else {
+               printk(KERN_INFO "btrfsic: error, cannot lookup dev (#2)!\n");
+               return -ENXIO;
+       }
+}
+
+static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx)
+{
+       if (NULL != block_ctx->bh) {
+               brelse(block_ctx->bh);
+               block_ctx->bh = NULL;
+       }
+}
+
+static int btrfsic_read_block(struct btrfsic_state *state,
+                             struct btrfsic_block_data_ctx *block_ctx)
+{
+       block_ctx->bh = NULL;
+       if (block_ctx->dev_bytenr & 4095) {
+               printk(KERN_INFO
+                      "btrfsic: read_block() with unaligned bytenr %llu\n",
+                      (unsigned long long)block_ctx->dev_bytenr);
+               return -1;
+       }
+       if (block_ctx->len > 4096) {
+               printk(KERN_INFO
+                      "btrfsic: read_block() with too huge size %d\n",
+                      block_ctx->len);
+               return -1;
+       }
+
+       block_ctx->bh = __bread(block_ctx->dev->bdev,
+                               block_ctx->dev_bytenr >> 12, 4096);
+       if (NULL == block_ctx->bh)
+               return -1;
+       block_ctx->data = block_ctx->bh->b_data;
+
+       return block_ctx->len;
+}
+
+static void btrfsic_dump_database(struct btrfsic_state *state)
+{
+       struct list_head *elem_all;
+
+       BUG_ON(NULL == state);
+
+       printk(KERN_INFO "all_blocks_list:\n");
+       list_for_each(elem_all, &state->all_blocks_list) {
+               const struct btrfsic_block *const b_all =
+                   list_entry(elem_all, struct btrfsic_block,
+                              all_blocks_node);
+               struct list_head *elem_ref_to;
+               struct list_head *elem_ref_from;
+
+               printk(KERN_INFO "%c-block @%llu (%s/%llu/%d)\n",
+                      btrfsic_get_block_type(state, b_all),
+                      (unsigned long long)b_all->logical_bytenr,
+                      b_all->dev_state->name,
+                      (unsigned long long)b_all->dev_bytenr,
+                      b_all->mirror_num);
+
+               list_for_each(elem_ref_to, &b_all->ref_to_list) {
+                       const struct btrfsic_block_link *const l =
+                           list_entry(elem_ref_to,
+                                      struct btrfsic_block_link,
+                                      node_ref_to);
+
+                       printk(KERN_INFO " %c @%llu (%s/%llu/%d)"
+                              " refers %u* to"
+                              " %c @%llu (%s/%llu/%d)\n",
+                              btrfsic_get_block_type(state, b_all),
+                              (unsigned long long)b_all->logical_bytenr,
+                              b_all->dev_state->name,
+                              (unsigned long long)b_all->dev_bytenr,
+                              b_all->mirror_num,
+                              l->ref_cnt,
+                              btrfsic_get_block_type(state, l->block_ref_to),
+                              (unsigned long long)
+                              l->block_ref_to->logical_bytenr,
+                              l->block_ref_to->dev_state->name,
+                              (unsigned long long)l->block_ref_to->dev_bytenr,
+                              l->block_ref_to->mirror_num);
+               }
+
+               list_for_each(elem_ref_from, &b_all->ref_from_list) {
+                       const struct btrfsic_block_link *const l =
+                           list_entry(elem_ref_from,
+                                      struct btrfsic_block_link,
+                                      node_ref_from);
+
+                       printk(KERN_INFO " %c @%llu (%s/%llu/%d)"
+                              " is ref %u* from"
+                              " %c @%llu (%s/%llu/%d)\n",
+                              btrfsic_get_block_type(state, b_all),
+                              (unsigned long long)b_all->logical_bytenr,
+                              b_all->dev_state->name,
+                              (unsigned long long)b_all->dev_bytenr,
+                              b_all->mirror_num,
+                              l->ref_cnt,
+                              btrfsic_get_block_type(state, l->block_ref_from),
+                              (unsigned long long)
+                              l->block_ref_from->logical_bytenr,
+                              l->block_ref_from->dev_state->name,
+                              (unsigned long long)
+                              l->block_ref_from->dev_bytenr,
+                              l->block_ref_from->mirror_num);
+               }
+
+               printk(KERN_INFO "\n");
+       }
+}
+
+/*
+ * Test whether the disk block contains a tree block (leaf or node)
+ * (note that this test fails for the super block)
+ */
+static int btrfsic_test_for_metadata(struct btrfsic_state *state,
+                                    const u8 *data, unsigned int size)
+{
+       struct btrfs_header *h;
+       u8 csum[BTRFS_CSUM_SIZE];
+       u32 crc = ~(u32)0;
+       int fail = 0;
+       int crc_fail = 0;
+
+       h = (struct btrfs_header *)data;
+
+       if (memcmp(h->fsid, state->root->fs_info->fsid, BTRFS_UUID_SIZE))
+               fail++;
+
+       crc = crc32c(crc, data + BTRFS_CSUM_SIZE, PAGE_SIZE - BTRFS_CSUM_SIZE);
+       btrfs_csum_final(crc, csum);
+       if (memcmp(csum, h->csum, state->csum_size))
+               crc_fail++;
+
+       return fail || crc_fail;
+}
+
+static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
+                                         u64 dev_bytenr,
+                                         u8 *mapped_data, unsigned int len,
+                                         struct bio *bio,
+                                         int *bio_is_patched,
+                                         struct buffer_head *bh,
+                                         int submit_bio_bh_rw)
+{
+       int is_metadata;
+       struct btrfsic_block *block;
+       struct btrfsic_block_data_ctx block_ctx;
+       int ret;
+       struct btrfsic_state *state = dev_state->state;
+       struct block_device *bdev = dev_state->bdev;
+
+       WARN_ON(len > PAGE_SIZE);
+       is_metadata = (0 == btrfsic_test_for_metadata(state, mapped_data, len));
+       if (NULL != bio_is_patched)
+               *bio_is_patched = 0;
+
+       block = btrfsic_block_hashtable_lookup(bdev, dev_bytenr,
+                                              &state->block_hashtable);
+       if (NULL != block) {
+               u64 bytenr;
+               struct list_head *elem_ref_to;
+               struct list_head *tmp_ref_to;
+
+               if (block->is_superblock) {
+                       bytenr = le64_to_cpu(((struct btrfs_super_block *)
+                                             mapped_data)->bytenr);
+                       is_metadata = 1;
+                       if (state->print_mask &
+                           BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE) {
+                               printk(KERN_INFO
+                                      "[before new superblock is written]:\n");
+                               btrfsic_dump_tree_sub(state, block, 0);
+                       }
+               }
+               if (is_metadata) {
+                       if (!block->is_superblock) {
+                               bytenr = le64_to_cpu(((struct btrfs_header *)
+                                                     mapped_data)->bytenr);
+                               btrfsic_cmp_log_and_dev_bytenr(state, bytenr,
+                                                              dev_state,
+                                                              dev_bytenr,
+                                                              mapped_data);
+                       }
+                       if (block->logical_bytenr != bytenr) {
+                               printk(KERN_INFO
+                                      "Written block @%llu (%s/%llu/%d)"
+                                      " found in hash table, %c,"
+                                      " bytenr mismatch"
+                                      " (!= stored %llu).\n",
+                                      (unsigned long long)bytenr,
+                                      dev_state->name,
+                                      (unsigned long long)dev_bytenr,
+                                      block->mirror_num,
+                                      btrfsic_get_block_type(state, block),
+                                      (unsigned long long)
+                                      block->logical_bytenr);
+                               block->logical_bytenr = bytenr;
+                       } else if (state->print_mask &
+                                  BTRFSIC_PRINT_MASK_VERBOSE)
+                               printk(KERN_INFO
+                                      "Written block @%llu (%s/%llu/%d)"
+                                      " found in hash table, %c.\n",
+                                      (unsigned long long)bytenr,
+                                      dev_state->name,
+                                      (unsigned long long)dev_bytenr,
+                                      block->mirror_num,
+                                      btrfsic_get_block_type(state, block));
+               } else {
+                       bytenr = block->logical_bytenr;
+                       if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+                               printk(KERN_INFO
+                                      "Written block @%llu (%s/%llu/%d)"
+                                      " found in hash table, %c.\n",
+                                      (unsigned long long)bytenr,
+                                      dev_state->name,
+                                      (unsigned long long)dev_bytenr,
+                                      block->mirror_num,
+                                      btrfsic_get_block_type(state, block));
+               }
+
+               if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+                       printk(KERN_INFO
+                              "ref_to_list: %cE, ref_from_list: %cE\n",
+                              list_empty(&block->ref_to_list) ? ' ' : '!',
+                              list_empty(&block->ref_from_list) ? ' ' : '!');
+               if (btrfsic_is_block_ref_by_superblock(state, block, 0)) {
+                       printk(KERN_INFO "btrfs: attempt to overwrite %c-block"
+                              " @%llu (%s/%llu/%d), old(gen=%llu,"
+                              " objectid=%llu, type=%d, offset=%llu),"
+                              " new(gen=%llu),"
+                              " which is referenced by most recent superblock"
+                              " (superblockgen=%llu)!\n",
+                              btrfsic_get_block_type(state, block),
+                              (unsigned long long)bytenr,
+                              dev_state->name,
+                              (unsigned long long)dev_bytenr,
+                              block->mirror_num,
+                              (unsigned long long)block->generation,
+                              (unsigned long long)
+                              le64_to_cpu(block->disk_key.objectid),
+                              block->disk_key.type,
+                              (unsigned long long)
+                              le64_to_cpu(block->disk_key.offset),
+                              (unsigned long long)
+                              le64_to_cpu(((struct btrfs_header *)
+                                           mapped_data)->generation),
+                              (unsigned long long)
+                              state->max_superblock_generation);
+                       btrfsic_dump_tree(state);
+               }
+
+               if (!block->is_iodone && !block->never_written) {
+                       printk(KERN_INFO "btrfs: attempt to overwrite %c-block"
+                              " @%llu (%s/%llu/%d), oldgen=%llu, newgen=%llu,"
+                              " which is not yet iodone!\n",
+                              btrfsic_get_block_type(state, block),
+                              (unsigned long long)bytenr,
+                              dev_state->name,
+                              (unsigned long long)dev_bytenr,
+                              block->mirror_num,
+                              (unsigned long long)block->generation,
+                              (unsigned long long)
+                              le64_to_cpu(((struct btrfs_header *)
+                                           mapped_data)->generation));
+                       /* it would not be safe to go on */
+                       btrfsic_dump_tree(state);
+                       return;
+               }
+
+               /*
+                * Clear all references of this block. Do not free
+                * the block itself even if is not referenced anymore
+                * because it still carries valueable information
+                * like whether it was ever written and IO completed.
+                */
+               list_for_each_safe(elem_ref_to, tmp_ref_to,
+                                  &block->ref_to_list) {
+                       struct btrfsic_block_link *const l =
+                           list_entry(elem_ref_to,
+                                      struct btrfsic_block_link,
+                                      node_ref_to);
+
+                       if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+                               btrfsic_print_rem_link(state, l);
+                       l->ref_cnt--;
+                       if (0 == l->ref_cnt) {
+                               list_del(&l->node_ref_to);
+                               list_del(&l->node_ref_from);
+                               btrfsic_block_link_hashtable_remove(l);
+                               btrfsic_block_link_free(l);
+                       }
+               }
+
+               if (block->is_superblock)
+                       ret = btrfsic_map_superblock(state, bytenr, len,
+                                                    bdev, &block_ctx);
+               else
+                       ret = btrfsic_map_block(state, bytenr, len,
+                                               &block_ctx, 0);
+               if (ret) {
+                       printk(KERN_INFO
+                              "btrfsic: btrfsic_map_block(root @%llu)"
+                              " failed!\n", (unsigned long long)bytenr);
+                       return;
+               }
+               block_ctx.data = mapped_data;
+               /* the following is required in case of writes to mirrors,
+                * use the same that was used for the lookup */
+               block_ctx.dev = dev_state;
+               block_ctx.dev_bytenr = dev_bytenr;
+
+               if (is_metadata || state->include_extent_data) {
+                       block->never_written = 0;
+                       block->iodone_w_error = 0;
+                       if (NULL != bio) {
+                               block->is_iodone = 0;
+                               BUG_ON(NULL == bio_is_patched);
+                               if (!*bio_is_patched) {
+                                       block->orig_bio_bh_private =
+                                           bio->bi_private;
+                                       block->orig_bio_bh_end_io.bio =
+                                           bio->bi_end_io;
+                                       block->next_in_same_bio = NULL;
+                                       bio->bi_private = block;
+                                       bio->bi_end_io = btrfsic_bio_end_io;
+                                       *bio_is_patched = 1;
+                               } else {
+                                       struct btrfsic_block *chained_block =
+                                           (struct btrfsic_block *)
+                                           bio->bi_private;
+
+                                       BUG_ON(NULL == chained_block);
+                                       block->orig_bio_bh_private =
+                                           chained_block->orig_bio_bh_private;
+                                       block->orig_bio_bh_end_io.bio =
+                                           chained_block->orig_bio_bh_end_io.
+                                           bio;
+                                       block->next_in_same_bio = chained_block;
+                                       bio->bi_private = block;
+                               }
+                       } else if (NULL != bh) {
+                               block->is_iodone = 0;
+                               block->orig_bio_bh_private = bh->b_private;
+                               block->orig_bio_bh_end_io.bh = bh->b_end_io;
+                               block->next_in_same_bio = NULL;
+                               bh->b_private = block;
+                               bh->b_end_io = btrfsic_bh_end_io;
+                       } else {
+                               block->is_iodone = 1;
+                               block->orig_bio_bh_private = NULL;
+                               block->orig_bio_bh_end_io.bio = NULL;
+                               block->next_in_same_bio = NULL;
+                       }
+               }
+
+               block->flush_gen = dev_state->last_flush_gen + 1;
+               block->submit_bio_bh_rw = submit_bio_bh_rw;
+               if (is_metadata) {
+                       block->logical_bytenr = bytenr;
+                       block->is_metadata = 1;
+                       if (block->is_superblock) {
+                               ret = btrfsic_process_written_superblock(
+                                               state,
+                                               block,
+                                               (struct btrfs_super_block *)
+                                               mapped_data);
+                               if (state->print_mask &
+                                   BTRFSIC_PRINT_MASK_TREE_AFTER_SB_WRITE) {
+                                       printk(KERN_INFO
+                                       "[after new superblock is written]:\n");
+                                       btrfsic_dump_tree_sub(state, block, 0);
+                               }
+                       } else {
+                               block->mirror_num = 0;  /* unknown */
+                               ret = btrfsic_process_metablock(
+                                               state,
+                                               block,
+                                               &block_ctx,
+                                               (struct btrfs_header *)
+                                               block_ctx.data,
+                                               0, 0);
+                       }
+                       if (ret)
+                               printk(KERN_INFO
+                                      "btrfsic: btrfsic_process_metablock"
+                                      "(root @%llu) failed!\n",
+                                      (unsigned long long)dev_bytenr);
+               } else {
+                       block->is_metadata = 0;
+                       block->mirror_num = 0;  /* unknown */
+                       block->generation = BTRFSIC_GENERATION_UNKNOWN;
+                       if (!state->include_extent_data
+                           && list_empty(&block->ref_from_list)) {
+                               /*
+                                * disk block is overwritten with extent
+                                * data (not meta data) and we are configured
+                                * to not include extent data: take the
+                                * chance and free the block's memory
+                                */
+                               btrfsic_block_hashtable_remove(block);
+                               list_del(&block->all_blocks_node);
+                               btrfsic_block_free(block);
+                       }
+               }
+               btrfsic_release_block_ctx(&block_ctx);
+       } else {
+               /* block has not been found in hash table */
+               u64 bytenr;
+
+               if (!is_metadata) {
+                       if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+                               printk(KERN_INFO "Written block (%s/%llu/?)"
+                                      " !found in hash table, D.\n",
+                                      dev_state->name,
+                                      (unsigned long long)dev_bytenr);
+                       if (!state->include_extent_data)
+                               return; /* ignore that written D block */
+
+                       /* this is getting ugly for the
+                        * include_extent_data case... */
+                       bytenr = 0;     /* unknown */
+                       block_ctx.start = bytenr;
+                       block_ctx.len = len;
+                       block_ctx.bh = NULL;
+               } else {
+                       bytenr = le64_to_cpu(((struct btrfs_header *)
+                                             mapped_data)->bytenr);
+                       btrfsic_cmp_log_and_dev_bytenr(state, bytenr, dev_state,
+                                                      dev_bytenr,
+                                                      mapped_data);
+                       if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+                               printk(KERN_INFO
+                                      "Written block @%llu (%s/%llu/?)"
+                                      " !found in hash table, M.\n",
+                                      (unsigned long long)bytenr,
+                                      dev_state->name,
+                                      (unsigned long long)dev_bytenr);
+
+                       ret = btrfsic_map_block(state, bytenr, len, &block_ctx,
+                                               0);
+                       if (ret) {
+                               printk(KERN_INFO
+                                      "btrfsic: btrfsic_map_block(root @%llu)"
+                                      " failed!\n",
+                                      (unsigned long long)dev_bytenr);
+                               return;
+                       }
+               }
+               block_ctx.data = mapped_data;
+               /* the following is required in case of writes to mirrors,
+                * use the same that was used for the lookup */
+               block_ctx.dev = dev_state;
+               block_ctx.dev_bytenr = dev_bytenr;
+
+               block = btrfsic_block_alloc();
+               if (NULL == block) {
+                       printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
+                       btrfsic_release_block_ctx(&block_ctx);
+                       return;
+               }
+               block->dev_state = dev_state;
+               block->dev_bytenr = dev_bytenr;
+               block->logical_bytenr = bytenr;
+               block->is_metadata = is_metadata;
+               block->never_written = 0;
+               block->iodone_w_error = 0;
+               block->mirror_num = 0;  /* unknown */
+               block->flush_gen = dev_state->last_flush_gen + 1;
+               block->submit_bio_bh_rw = submit_bio_bh_rw;
+               if (NULL != bio) {
+                       block->is_iodone = 0;
+                       BUG_ON(NULL == bio_is_patched);
+                       if (!*bio_is_patched) {
+                               block->orig_bio_bh_private = bio->bi_private;
+                               block->orig_bio_bh_end_io.bio = bio->bi_end_io;
+                               block->next_in_same_bio = NULL;
+                               bio->bi_private = block;
+                               bio->bi_end_io = btrfsic_bio_end_io;
+                               *bio_is_patched = 1;
+                       } else {
+                               struct btrfsic_block *chained_block =
+                                   (struct btrfsic_block *)
+                                   bio->bi_private;
+
+                               BUG_ON(NULL == chained_block);
+                               block->orig_bio_bh_private =
+                                   chained_block->orig_bio_bh_private;
+                               block->orig_bio_bh_end_io.bio =
+                                   chained_block->orig_bio_bh_end_io.bio;
+                               block->next_in_same_bio = chained_block;
+                               bio->bi_private = block;
+                       }
+               } else if (NULL != bh) {
+                       block->is_iodone = 0;
+                       block->orig_bio_bh_private = bh->b_private;
+                       block->orig_bio_bh_end_io.bh = bh->b_end_io;
+                       block->next_in_same_bio = NULL;
+                       bh->b_private = block;
+                       bh->b_end_io = btrfsic_bh_end_io;
+               } else {
+                       block->is_iodone = 1;
+                       block->orig_bio_bh_private = NULL;
+                       block->orig_bio_bh_end_io.bio = NULL;
+                       block->next_in_same_bio = NULL;
+               }
+               if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+                       printk(KERN_INFO
+                              "New written %c-block @%llu (%s/%llu/%d)\n",
+                              is_metadata ? 'M' : 'D',
+                              (unsigned long long)block->logical_bytenr,
+                              block->dev_state->name,
+                              (unsigned long long)block->dev_bytenr,
+                              block->mirror_num);
+               list_add(&block->all_blocks_node, &state->all_blocks_list);
+               btrfsic_block_hashtable_add(block, &state->block_hashtable);
+
+               if (is_metadata) {
+                       ret = btrfsic_process_metablock(state, block,
+                                                       &block_ctx,
+                                                       (struct btrfs_header *)
+                                                       block_ctx.data, 0, 0);
+                       if (ret)
+                               printk(KERN_INFO
+                                      "btrfsic: process_metablock(root @%llu)"
+                                      " failed!\n",
+                                      (unsigned long long)dev_bytenr);
+               }
+               btrfsic_release_block_ctx(&block_ctx);
+       }
+}
+
+static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status)
+{
+       struct btrfsic_block *block = (struct btrfsic_block *)bp->bi_private;
+       int iodone_w_error;
+
+       /* mutex is not held! This is not save if IO is not yet completed
+        * on umount */
+       iodone_w_error = 0;
+       if (bio_error_status)
+               iodone_w_error = 1;
+
+       BUG_ON(NULL == block);
+       bp->bi_private = block->orig_bio_bh_private;
+       bp->bi_end_io = block->orig_bio_bh_end_io.bio;
+
+       do {
+               struct btrfsic_block *next_block;
+               struct btrfsic_dev_state *const dev_state = block->dev_state;
+
+               if ((dev_state->state->print_mask &
+                    BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
+                       printk(KERN_INFO
+                              "bio_end_io(err=%d) for %c @%llu (%s/%llu/%d)\n",
+                              bio_error_status,
+                              btrfsic_get_block_type(dev_state->state, block),
+                              (unsigned long long)block->logical_bytenr,
+                              dev_state->name,
+                              (unsigned long long)block->dev_bytenr,
+                              block->mirror_num);
+               next_block = block->next_in_same_bio;
+               block->iodone_w_error = iodone_w_error;
+               if (block->submit_bio_bh_rw & REQ_FLUSH) {
+                       dev_state->last_flush_gen++;
+                       if ((dev_state->state->print_mask &
+                            BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
+                               printk(KERN_INFO
+                                      "bio_end_io() new %s flush_gen=%llu\n",
+                                      dev_state->name,
+                                      (unsigned long long)
+                                      dev_state->last_flush_gen);
+               }
+               if (block->submit_bio_bh_rw & REQ_FUA)
+                       block->flush_gen = 0; /* FUA completed means block is
+                                              * on disk */
+               block->is_iodone = 1; /* for FLUSH, this releases the block */
+               block = next_block;
+       } while (NULL != block);
+
+       bp->bi_end_io(bp, bio_error_status);
+}
+
+static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate)
+{
+       struct btrfsic_block *block = (struct btrfsic_block *)bh->b_private;
+       int iodone_w_error = !uptodate;
+       struct btrfsic_dev_state *dev_state;
+
+       BUG_ON(NULL == block);
+       dev_state = block->dev_state;
+       if ((dev_state->state->print_mask & BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
+               printk(KERN_INFO
+                      "bh_end_io(error=%d) for %c @%llu (%s/%llu/%d)\n",
+                      iodone_w_error,
+                      btrfsic_get_block_type(dev_state->state, block),
+                      (unsigned long long)block->logical_bytenr,
+                      block->dev_state->name,
+                      (unsigned long long)block->dev_bytenr,
+                      block->mirror_num);
+
+       block->iodone_w_error = iodone_w_error;
+       if (block->submit_bio_bh_rw & REQ_FLUSH) {
+               dev_state->last_flush_gen++;
+               if ((dev_state->state->print_mask &
+                    BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
+                       printk(KERN_INFO
+                              "bh_end_io() new %s flush_gen=%llu\n",
+                              dev_state->name,
+                              (unsigned long long)dev_state->last_flush_gen);
+       }
+       if (block->submit_bio_bh_rw & REQ_FUA)
+               block->flush_gen = 0; /* FUA completed means block is on disk */
+
+       bh->b_private = block->orig_bio_bh_private;
+       bh->b_end_io = block->orig_bio_bh_end_io.bh;
+       block->is_iodone = 1; /* for FLUSH, this releases the block */
+       bh->b_end_io(bh, uptodate);
+}
+
+static int btrfsic_process_written_superblock(
+               struct btrfsic_state *state,
+               struct btrfsic_block *const superblock,
+               struct btrfs_super_block *const super_hdr)
+{
+       int pass;
+
+       superblock->generation = btrfs_super_generation(super_hdr);
+       if (!(superblock->generation > state->max_superblock_generation ||
+             0 == state->max_superblock_generation)) {
+               if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE)
+                       printk(KERN_INFO
+                              "btrfsic: superblock @%llu (%s/%llu/%d)"
+                              " with old gen %llu <= %llu\n",
+                              (unsigned long long)superblock->logical_bytenr,
+                              superblock->dev_state->name,
+                              (unsigned long long)superblock->dev_bytenr,
+                              superblock->mirror_num,
+                              (unsigned long long)
+                              btrfs_super_generation(super_hdr),
+                              (unsigned long long)
+                              state->max_superblock_generation);
+       } else {
+               if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE)
+                       printk(KERN_INFO
+                              "btrfsic: got new superblock @%llu (%s/%llu/%d)"
+                              " with new gen %llu > %llu\n",
+                              (unsigned long long)superblock->logical_bytenr,
+                              superblock->dev_state->name,
+                              (unsigned long long)superblock->dev_bytenr,
+                              superblock->mirror_num,
+                              (unsigned long long)
+                              btrfs_super_generation(super_hdr),
+                              (unsigned long long)
+                              state->max_superblock_generation);
+
+               state->max_superblock_generation =
+                   btrfs_super_generation(super_hdr);
+               state->latest_superblock = superblock;
+       }
+
+       for (pass = 0; pass < 3; pass++) {
+               int ret;
+               u64 next_bytenr;
+               struct btrfsic_block *next_block;
+               struct btrfsic_block_data_ctx tmp_next_block_ctx;
+               struct btrfsic_block_link *l;
+               int num_copies;
+               int mirror_num;
+               const char *additional_string = NULL;
+               struct btrfs_disk_key tmp_disk_key;
+
+               tmp_disk_key.type = BTRFS_ROOT_ITEM_KEY;
+               tmp_disk_key.offset = 0;
+
+               switch (pass) {
+               case 0:
+                       tmp_disk_key.objectid =
+                           cpu_to_le64(BTRFS_ROOT_TREE_OBJECTID);
+                       additional_string = "root ";
+                       next_bytenr = btrfs_super_root(super_hdr);
+                       if (state->print_mask &
+                           BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
+                               printk(KERN_INFO "root@%llu\n",
+                                      (unsigned long long)next_bytenr);
+                       break;
+               case 1:
+                       tmp_disk_key.objectid =
+                           cpu_to_le64(BTRFS_CHUNK_TREE_OBJECTID);
+                       additional_string = "chunk ";
+                       next_bytenr = btrfs_super_chunk_root(super_hdr);
+                       if (state->print_mask &
+                           BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
+                               printk(KERN_INFO "chunk@%llu\n",
+                                      (unsigned long long)next_bytenr);
+                       break;
+               case 2:
+                       tmp_disk_key.objectid =
+                           cpu_to_le64(BTRFS_TREE_LOG_OBJECTID);
+                       additional_string = "log ";
+                       next_bytenr = btrfs_super_log_root(super_hdr);
+                       if (0 == next_bytenr)
+                               continue;
+                       if (state->print_mask &
+                           BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
+                               printk(KERN_INFO "log@%llu\n",
+                                      (unsigned long long)next_bytenr);
+                       break;
+               }
+
+               num_copies =
+                   btrfs_num_copies(&state->root->fs_info->mapping_tree,
+                                    next_bytenr, PAGE_SIZE);
+               if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
+                       printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
+                              (unsigned long long)next_bytenr, num_copies);
+               for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
+                       int was_created;
+
+                       if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+                               printk(KERN_INFO
+                                      "btrfsic_process_written_superblock("
+                                      "mirror_num=%d)\n", mirror_num);
+                       ret = btrfsic_map_block(state, next_bytenr, PAGE_SIZE,
+                                               &tmp_next_block_ctx,
+                                               mirror_num);
+                       if (ret) {
+                               printk(KERN_INFO
+                                      "btrfsic: btrfsic_map_block(@%llu,"
+                                      " mirror=%d) failed!\n",
+                                      (unsigned long long)next_bytenr,
+                                      mirror_num);
+                               return -1;
+                       }
+
+                       next_block = btrfsic_block_lookup_or_add(
+                                       state,
+                                       &tmp_next_block_ctx,
+                                       additional_string,
+                                       1, 0, 1,
+                                       mirror_num,
+                                       &was_created);
+                       if (NULL == next_block) {
+                               printk(KERN_INFO
+                                      "btrfsic: error, kmalloc failed!\n");
+                               btrfsic_release_block_ctx(&tmp_next_block_ctx);
+                               return -1;
+                       }
+
+                       next_block->disk_key = tmp_disk_key;
+                       if (was_created)
+                               next_block->generation =
+                                   BTRFSIC_GENERATION_UNKNOWN;
+                       l = btrfsic_block_link_lookup_or_add(
+                                       state,
+                                       &tmp_next_block_ctx,
+                                       next_block,
+                                       superblock,
+                                       BTRFSIC_GENERATION_UNKNOWN);
+                       btrfsic_release_block_ctx(&tmp_next_block_ctx);
+                       if (NULL == l)
+                               return -1;
+               }
+       }
+
+       if (-1 == btrfsic_check_all_ref_blocks(state, superblock, 0)) {
+               WARN_ON(1);
+               btrfsic_dump_tree(state);
+       }
+
+       return 0;
+}
+
+static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state,
+                                       struct btrfsic_block *const block,
+                                       int recursion_level)
+{
+       struct list_head *elem_ref_to;
+       int ret = 0;
+
+       if (recursion_level >= 3 + BTRFS_MAX_LEVEL) {
+               /*
+                * Note that this situation can happen and does not
+                * indicate an error in regular cases. It happens
+                * when disk blocks are freed and later reused.
+                * The check-integrity module is not aware of any
+                * block free operations, it just recognizes block
+                * write operations. Therefore it keeps the linkage
+                * information for a block until a block is
+                * rewritten. This can temporarily cause incorrect
+                * and even circular linkage informations. This
+                * causes no harm unless such blocks are referenced
+                * by the most recent super block.
+                */
+               if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+                       printk(KERN_INFO
+                              "btrfsic: abort cyclic linkage (case 1).\n");
+
+               return ret;
+       }
+
+       /*
+        * This algorithm is recursive because the amount of used stack
+        * space is very small and the max recursion depth is limited.
+        */
+       list_for_each(elem_ref_to, &block->ref_to_list) {
+               const struct btrfsic_block_link *const l =
+                   list_entry(elem_ref_to, struct btrfsic_block_link,
+                              node_ref_to);
+
+               if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+                       printk(KERN_INFO
+                              "rl=%d, %c @%llu (%s/%llu/%d)"
+                              " %u* refers to %c @%llu (%s/%llu/%d)\n",
+                              recursion_level,
+                              btrfsic_get_block_type(state, block),
+                              (unsigned long long)block->logical_bytenr,
+                              block->dev_state->name,
+                              (unsigned long long)block->dev_bytenr,
+                              block->mirror_num,
+                              l->ref_cnt,
+                              btrfsic_get_block_type(state, l->block_ref_to),
+                              (unsigned long long)
+                              l->block_ref_to->logical_bytenr,
+                              l->block_ref_to->dev_state->name,
+                              (unsigned long long)l->block_ref_to->dev_bytenr,
+                              l->block_ref_to->mirror_num);
+               if (l->block_ref_to->never_written) {
+                       printk(KERN_INFO "btrfs: attempt to write superblock"
+                              " which references block %c @%llu (%s/%llu/%d)"
+                              " which is never written!\n",
+                              btrfsic_get_block_type(state, l->block_ref_to),
+                              (unsigned long long)
+                              l->block_ref_to->logical_bytenr,
+                              l->block_ref_to->dev_state->name,
+                              (unsigned long long)l->block_ref_to->dev_bytenr,
+                              l->block_ref_to->mirror_num);
+                       ret = -1;
+               } else if (!l->block_ref_to->is_iodone) {
+                       printk(KERN_INFO "btrfs: attempt to write superblock"
+                              " which references block %c @%llu (%s/%llu/%d)"
+                              " which is not yet iodone!\n",
+                              btrfsic_get_block_type(state, l->block_ref_to),
+                              (unsigned long long)
+                              l->block_ref_to->logical_bytenr,
+                              l->block_ref_to->dev_state->name,
+                              (unsigned long long)l->block_ref_to->dev_bytenr,
+                              l->block_ref_to->mirror_num);
+                       ret = -1;
+               } else if (l->parent_generation !=
+                          l->block_ref_to->generation &&
+                          BTRFSIC_GENERATION_UNKNOWN !=
+                          l->parent_generation &&
+                          BTRFSIC_GENERATION_UNKNOWN !=
+                          l->block_ref_to->generation) {
+                       printk(KERN_INFO "btrfs: attempt to write superblock"
+                              " which references block %c @%llu (%s/%llu/%d)"
+                              " with generation %llu !="
+                              " parent generation %llu!\n",
+                              btrfsic_get_block_type(state, l->block_ref_to),
+                              (unsigned long long)
+                              l->block_ref_to->logical_bytenr,
+                              l->block_ref_to->dev_state->name,
+                              (unsigned long long)l->block_ref_to->dev_bytenr,
+                              l->block_ref_to->mirror_num,
+                              (unsigned long long)l->block_ref_to->generation,
+                              (unsigned long long)l->parent_generation);
+                       ret = -1;
+               } else if (l->block_ref_to->flush_gen >
+                          l->block_ref_to->dev_state->last_flush_gen) {
+                       printk(KERN_INFO "btrfs: attempt to write superblock"
+                              " which references block %c @%llu (%s/%llu/%d)"
+                              " which is not flushed out of disk's write cache"
+                              " (block flush_gen=%llu,"
+                              " dev->flush_gen=%llu)!\n",
+                              btrfsic_get_block_type(state, l->block_ref_to),
+                              (unsigned long long)
+                              l->block_ref_to->logical_bytenr,
+                              l->block_ref_to->dev_state->name,
+                              (unsigned long long)l->block_ref_to->dev_bytenr,
+                              l->block_ref_to->mirror_num,
+                              (unsigned long long)block->flush_gen,
+                              (unsigned long long)
+                              l->block_ref_to->dev_state->last_flush_gen);
+                       ret = -1;
+               } else if (-1 == btrfsic_check_all_ref_blocks(state,
+                                                             l->block_ref_to,
+                                                             recursion_level +
+                                                             1)) {
+                       ret = -1;
+               }
+       }
+
+       return ret;
+}
+
+static int btrfsic_is_block_ref_by_superblock(
+               const struct btrfsic_state *state,
+               const struct btrfsic_block *block,
+               int recursion_level)
+{
+       struct list_head *elem_ref_from;
+
+       if (recursion_level >= 3 + BTRFS_MAX_LEVEL) {
+               /* refer to comment at "abort cyclic linkage (case 1)" */
+               if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+                       printk(KERN_INFO
+                              "btrfsic: abort cyclic linkage (case 2).\n");
+
+               return 0;
+       }
+
+       /*
+        * This algorithm is recursive because the amount of used stack space
+        * is very small and the max recursion depth is limited.
+        */
+       list_for_each(elem_ref_from, &block->ref_from_list) {
+               const struct btrfsic_block_link *const l =
+                   list_entry(elem_ref_from, struct btrfsic_block_link,
+                              node_ref_from);
+
+               if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+                       printk(KERN_INFO
+                              "rl=%d, %c @%llu (%s/%llu/%d)"
+                              " is ref %u* from %c @%llu (%s/%llu/%d)\n",
+                              recursion_level,
+                              btrfsic_get_block_type(state, block),
+                              (unsigned long long)block->logical_bytenr,
+                              block->dev_state->name,
+                              (unsigned long long)block->dev_bytenr,
+                              block->mirror_num,
+                              l->ref_cnt,
+                              btrfsic_get_block_type(state, l->block_ref_from),
+                              (unsigned long long)
+                              l->block_ref_from->logical_bytenr,
+                              l->block_ref_from->dev_state->name,
+                              (unsigned long long)
+                              l->block_ref_from->dev_bytenr,
+                              l->block_ref_from->mirror_num);
+               if (l->block_ref_from->is_superblock &&
+                   state->latest_superblock->dev_bytenr ==
+                   l->block_ref_from->dev_bytenr &&
+                   state->latest_superblock->dev_state->bdev ==
+                   l->block_ref_from->dev_state->bdev)
+                       return 1;
+               else if (btrfsic_is_block_ref_by_superblock(state,
+                                                           l->block_ref_from,
+                                                           recursion_level +
+                                                           1))
+                       return 1;
+       }
+
+       return 0;
+}
+
+static void btrfsic_print_add_link(const struct btrfsic_state *state,
+                                  const struct btrfsic_block_link *l)
+{
+       printk(KERN_INFO
+              "Add %u* link from %c @%llu (%s/%llu/%d)"
+              " to %c @%llu (%s/%llu/%d).\n",
+              l->ref_cnt,
+              btrfsic_get_block_type(state, l->block_ref_from),
+              (unsigned long long)l->block_ref_from->logical_bytenr,
+              l->block_ref_from->dev_state->name,
+              (unsigned long long)l->block_ref_from->dev_bytenr,
+              l->block_ref_from->mirror_num,
+              btrfsic_get_block_type(state, l->block_ref_to),
+              (unsigned long long)l->block_ref_to->logical_bytenr,
+              l->block_ref_to->dev_state->name,
+              (unsigned long long)l->block_ref_to->dev_bytenr,
+              l->block_ref_to->mirror_num);
+}
+
+static void btrfsic_print_rem_link(const struct btrfsic_state *state,
+                                  const struct btrfsic_block_link *l)
+{
+       printk(KERN_INFO
+              "Rem %u* link from %c @%llu (%s/%llu/%d)"
+              " to %c @%llu (%s/%llu/%d).\n",
+              l->ref_cnt,
+              btrfsic_get_block_type(state, l->block_ref_from),
+              (unsigned long long)l->block_ref_from->logical_bytenr,
+              l->block_ref_from->dev_state->name,
+              (unsigned long long)l->block_ref_from->dev_bytenr,
+              l->block_ref_from->mirror_num,
+              btrfsic_get_block_type(state, l->block_ref_to),
+              (unsigned long long)l->block_ref_to->logical_bytenr,
+              l->block_ref_to->dev_state->name,
+              (unsigned long long)l->block_ref_to->dev_bytenr,
+              l->block_ref_to->mirror_num);
+}
+
+static char btrfsic_get_block_type(const struct btrfsic_state *state,
+                                  const struct btrfsic_block *block)
+{
+       if (block->is_superblock &&
+           state->latest_superblock->dev_bytenr == block->dev_bytenr &&
+           state->latest_superblock->dev_state->bdev == block->dev_state->bdev)
+               return 'S';
+       else if (block->is_superblock)
+               return 's';
+       else if (block->is_metadata)
+               return 'M';
+       else
+               return 'D';
+}
+
+static void btrfsic_dump_tree(const struct btrfsic_state *state)
+{
+       btrfsic_dump_tree_sub(state, state->latest_superblock, 0);
+}
+
+static void btrfsic_dump_tree_sub(const struct btrfsic_state *state,
+                                 const struct btrfsic_block *block,
+                                 int indent_level)
+{
+       struct list_head *elem_ref_to;
+       int indent_add;
+       static char buf[80];
+       int cursor_position;
+
+       /*
+        * Should better fill an on-stack buffer with a complete line and
+        * dump it at once when it is time to print a newline character.
+        */
+
+       /*
+        * This algorithm is recursive because the amount of used stack space
+        * is very small and the max recursion depth is limited.
+        */
+       indent_add = sprintf(buf, "%c-%llu(%s/%llu/%d)",
+                            btrfsic_get_block_type(state, block),
+                            (unsigned long long)block->logical_bytenr,
+                            block->dev_state->name,
+                            (unsigned long long)block->dev_bytenr,
+                            block->mirror_num);
+       if (indent_level + indent_add > BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL) {
+               printk("[...]\n");
+               return;
+       }
+       printk(buf);
+       indent_level += indent_add;
+       if (list_empty(&block->ref_to_list)) {
+               printk("\n");
+               return;
+       }
+       if (block->mirror_num > 1 &&
+           !(state->print_mask & BTRFSIC_PRINT_MASK_TREE_WITH_ALL_MIRRORS)) {
+               printk(" [...]\n");
+               return;
+       }
+
+       cursor_position = indent_level;
+       list_for_each(elem_ref_to, &block->ref_to_list) {
+               const struct btrfsic_block_link *const l =
+                   list_entry(elem_ref_to, struct btrfsic_block_link,
+                              node_ref_to);
+
+               while (cursor_position < indent_level) {
+                       printk(" ");
+                       cursor_position++;
+               }
+               if (l->ref_cnt > 1)
+                       indent_add = sprintf(buf, " %d*--> ", l->ref_cnt);
+               else
+                       indent_add = sprintf(buf, " --> ");
+               if (indent_level + indent_add >
+                   BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL) {
+                       printk("[...]\n");
+                       cursor_position = 0;
+                       continue;
+               }
+
+               printk(buf);
+
+               btrfsic_dump_tree_sub(state, l->block_ref_to,
+                                     indent_level + indent_add);
+               cursor_position = 0;
+       }
+}
+
+static struct btrfsic_block_link *btrfsic_block_link_lookup_or_add(
+               struct btrfsic_state *state,
+               struct btrfsic_block_data_ctx *next_block_ctx,
+               struct btrfsic_block *next_block,
+               struct btrfsic_block *from_block,
+               u64 parent_generation)
+{
+       struct btrfsic_block_link *l;
+
+       l = btrfsic_block_link_hashtable_lookup(next_block_ctx->dev->bdev,
+                                               next_block_ctx->dev_bytenr,
+                                               from_block->dev_state->bdev,
+                                               from_block->dev_bytenr,
+                                               &state->block_link_hashtable);
+       if (NULL == l) {
+               l = btrfsic_block_link_alloc();
+               if (NULL == l) {
+                       printk(KERN_INFO
+                              "btrfsic: error, kmalloc" " failed!\n");
+                       return NULL;
+               }
+
+               l->block_ref_to = next_block;
+               l->block_ref_from = from_block;
+               l->ref_cnt = 1;
+               l->parent_generation = parent_generation;
+
+               if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+                       btrfsic_print_add_link(state, l);
+
+               list_add(&l->node_ref_to, &from_block->ref_to_list);
+               list_add(&l->node_ref_from, &next_block->ref_from_list);
+
+               btrfsic_block_link_hashtable_add(l,
+                                                &state->block_link_hashtable);
+       } else {
+               l->ref_cnt++;
+               l->parent_generation = parent_generation;
+               if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+                       btrfsic_print_add_link(state, l);
+       }
+
+       return l;
+}
+
+static struct btrfsic_block *btrfsic_block_lookup_or_add(
+               struct btrfsic_state *state,
+               struct btrfsic_block_data_ctx *block_ctx,
+               const char *additional_string,
+               int is_metadata,
+               int is_iodone,
+               int never_written,
+               int mirror_num,
+               int *was_created)
+{
+       struct btrfsic_block *block;
+
+       block = btrfsic_block_hashtable_lookup(block_ctx->dev->bdev,
+                                              block_ctx->dev_bytenr,
+                                              &state->block_hashtable);
+       if (NULL == block) {
+               struct btrfsic_dev_state *dev_state;
+
+               block = btrfsic_block_alloc();
+               if (NULL == block) {
+                       printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
+                       return NULL;
+               }
+               dev_state = btrfsic_dev_state_lookup(block_ctx->dev->bdev);
+               if (NULL == dev_state) {
+                       printk(KERN_INFO
+                              "btrfsic: error, lookup dev_state failed!\n");
+                       btrfsic_block_free(block);
+                       return NULL;
+               }
+               block->dev_state = dev_state;
+               block->dev_bytenr = block_ctx->dev_bytenr;
+               block->logical_bytenr = block_ctx->start;
+               block->is_metadata = is_metadata;
+               block->is_iodone = is_iodone;
+               block->never_written = never_written;
+               block->mirror_num = mirror_num;
+               if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+                       printk(KERN_INFO
+                              "New %s%c-block @%llu (%s/%llu/%d)\n",
+                              additional_string,
+                              btrfsic_get_block_type(state, block),
+                              (unsigned long long)block->logical_bytenr,
+                              dev_state->name,
+                              (unsigned long long)block->dev_bytenr,
+                              mirror_num);
+               list_add(&block->all_blocks_node, &state->all_blocks_list);
+               btrfsic_block_hashtable_add(block, &state->block_hashtable);
+               if (NULL != was_created)
+                       *was_created = 1;
+       } else {
+               if (NULL != was_created)
+                       *was_created = 0;
+       }
+
+       return block;
+}
+
+static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
+                                          u64 bytenr,
+                                          struct btrfsic_dev_state *dev_state,
+                                          u64 dev_bytenr, char *data)
+{
+       int num_copies;
+       int mirror_num;
+       int ret;
+       struct btrfsic_block_data_ctx block_ctx;
+       int match = 0;
+
+       num_copies = btrfs_num_copies(&state->root->fs_info->mapping_tree,
+                                     bytenr, PAGE_SIZE);
+
+       for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
+               ret = btrfsic_map_block(state, bytenr, PAGE_SIZE,
+                                       &block_ctx, mirror_num);
+               if (ret) {
+                       printk(KERN_INFO "btrfsic:"
+                              " btrfsic_map_block(logical @%llu,"
+                              " mirror %d) failed!\n",
+                              (unsigned long long)bytenr, mirror_num);
+                       continue;
+               }
+
+               if (dev_state->bdev == block_ctx.dev->bdev &&
+                   dev_bytenr == block_ctx.dev_bytenr) {
+                       match++;
+                       btrfsic_release_block_ctx(&block_ctx);
+                       break;
+               }
+               btrfsic_release_block_ctx(&block_ctx);
+       }
+
+       if (!match) {
+               printk(KERN_INFO "btrfs: attempt to write M-block which contains logical bytenr that doesn't map to dev+physical bytenr of submit_bio,"
+                      " buffer->log_bytenr=%llu, submit_bio(bdev=%s,"
+                      " phys_bytenr=%llu)!\n",
+                      (unsigned long long)bytenr, dev_state->name,
+                      (unsigned long long)dev_bytenr);
+               for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
+                       ret = btrfsic_map_block(state, bytenr, PAGE_SIZE,
+                                               &block_ctx, mirror_num);
+                       if (ret)
+                               continue;
+
+                       printk(KERN_INFO "Read logical bytenr @%llu maps to"
+                              " (%s/%llu/%d)\n",
+                              (unsigned long long)bytenr,
+                              block_ctx.dev->name,
+                              (unsigned long long)block_ctx.dev_bytenr,
+                              mirror_num);
+               }
+               WARN_ON(1);
+       }
+}
+
+static struct btrfsic_dev_state *btrfsic_dev_state_lookup(
+               struct block_device *bdev)
+{
+       struct btrfsic_dev_state *ds;
+
+       ds = btrfsic_dev_state_hashtable_lookup(bdev,
+                                               &btrfsic_dev_state_hashtable);
+       return ds;
+}
+
+int btrfsic_submit_bh(int rw, struct buffer_head *bh)
+{
+       struct btrfsic_dev_state *dev_state;
+
+       if (!btrfsic_is_initialized)
+               return submit_bh(rw, bh);
+
+       mutex_lock(&btrfsic_mutex);
+       /* since btrfsic_submit_bh() might also be called before
+        * btrfsic_mount(), this might return NULL */
+       dev_state = btrfsic_dev_state_lookup(bh->b_bdev);
+
+       /* Only called to write the superblock (incl. FLUSH/FUA) */
+       if (NULL != dev_state &&
+           (rw & WRITE) && bh->b_size > 0) {
+               u64 dev_bytenr;
+
+               dev_bytenr = 4096 * bh->b_blocknr;
+               if (dev_state->state->print_mask &
+                   BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
+                       printk(KERN_INFO
+                              "submit_bh(rw=0x%x, blocknr=%lu (bytenr %llu),"
+                              " size=%lu, data=%p, bdev=%p)\n",
+                              rw, bh->b_blocknr,
+                              (unsigned long long)dev_bytenr, bh->b_size,
+                              bh->b_data, bh->b_bdev);
+               btrfsic_process_written_block(dev_state, dev_bytenr,
+                                             bh->b_data, bh->b_size, NULL,
+                                             NULL, bh, rw);
+       } else if (NULL != dev_state && (rw & REQ_FLUSH)) {
+               if (dev_state->state->print_mask &
+                   BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
+                       printk(KERN_INFO
+                              "submit_bh(rw=0x%x) FLUSH, bdev=%p)\n",
+                              rw, bh->b_bdev);
+               if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) {
+                       if ((dev_state->state->print_mask &
+                            (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
+                             BTRFSIC_PRINT_MASK_VERBOSE)))
+                               printk(KERN_INFO
+                                      "btrfsic_submit_bh(%s) with FLUSH"
+                                      " but dummy block already in use"
+                                      " (ignored)!\n",
+                                      dev_state->name);
+               } else {
+                       struct btrfsic_block *const block =
+                               &dev_state->dummy_block_for_bio_bh_flush;
+
+                       block->is_iodone = 0;
+                       block->never_written = 0;
+                       block->iodone_w_error = 0;
+                       block->flush_gen = dev_state->last_flush_gen + 1;
+                       block->submit_bio_bh_rw = rw;
+                       block->orig_bio_bh_private = bh->b_private;
+                       block->orig_bio_bh_end_io.bh = bh->b_end_io;
+                       block->next_in_same_bio = NULL;
+                       bh->b_private = block;
+                       bh->b_end_io = btrfsic_bh_end_io;
+               }
+       }
+       mutex_unlock(&btrfsic_mutex);
+       return submit_bh(rw, bh);
+}
+
+void btrfsic_submit_bio(int rw, struct bio *bio)
+{
+       struct btrfsic_dev_state *dev_state;
+
+       if (!btrfsic_is_initialized) {
+               submit_bio(rw, bio);
+               return;
+       }
+
+       mutex_lock(&btrfsic_mutex);
+       /* since btrfsic_submit_bio() is also called before
+        * btrfsic_mount(), this might return NULL */
+       dev_state = btrfsic_dev_state_lookup(bio->bi_bdev);
+       if (NULL != dev_state &&
+           (rw & WRITE) && NULL != bio->bi_io_vec) {
+               unsigned int i;
+               u64 dev_bytenr;
+               int bio_is_patched;
+
+               dev_bytenr = 512 * bio->bi_sector;
+               bio_is_patched = 0;
+               if (dev_state->state->print_mask &
+                   BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
+                       printk(KERN_INFO
+                              "submit_bio(rw=0x%x, bi_vcnt=%u,"
+                              " bi_sector=%lu (bytenr %llu), bi_bdev=%p)\n",
+                              rw, bio->bi_vcnt, bio->bi_sector,
+                              (unsigned long long)dev_bytenr,
+                              bio->bi_bdev);
+
+               for (i = 0; i < bio->bi_vcnt; i++) {
+                       u8 *mapped_data;
+
+                       mapped_data = kmap(bio->bi_io_vec[i].bv_page);
+                       if ((BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
+                            BTRFSIC_PRINT_MASK_VERBOSE) ==
+                           (dev_state->state->print_mask &
+                            (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
+                             BTRFSIC_PRINT_MASK_VERBOSE)))
+                               printk(KERN_INFO
+                                      "#%u: page=%p, mapped=%p, len=%u,"
+                                      " offset=%u\n",
+                                      i, bio->bi_io_vec[i].bv_page,
+                                      mapped_data,
+                                      bio->bi_io_vec[i].bv_len,
+                                      bio->bi_io_vec[i].bv_offset);
+                       btrfsic_process_written_block(dev_state, dev_bytenr,
+                                                     mapped_data,
+                                                     bio->bi_io_vec[i].bv_len,
+                                                     bio, &bio_is_patched,
+                                                     NULL, rw);
+                       kunmap(bio->bi_io_vec[i].bv_page);
+                       dev_bytenr += bio->bi_io_vec[i].bv_len;
+               }
+       } else if (NULL != dev_state && (rw & REQ_FLUSH)) {
+               if (dev_state->state->print_mask &
+                   BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
+                       printk(KERN_INFO
+                              "submit_bio(rw=0x%x) FLUSH, bdev=%p)\n",
+                              rw, bio->bi_bdev);
+               if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) {
+                       if ((dev_state->state->print_mask &
+                            (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
+                             BTRFSIC_PRINT_MASK_VERBOSE)))
+                               printk(KERN_INFO
+                                      "btrfsic_submit_bio(%s) with FLUSH"
+                                      " but dummy block already in use"
+                                      " (ignored)!\n",
+                                      dev_state->name);
+               } else {
+                       struct btrfsic_block *const block =
+                               &dev_state->dummy_block_for_bio_bh_flush;
+
+                       block->is_iodone = 0;
+                       block->never_written = 0;
+                       block->iodone_w_error = 0;
+                       block->flush_gen = dev_state->last_flush_gen + 1;
+                       block->submit_bio_bh_rw = rw;
+                       block->orig_bio_bh_private = bio->bi_private;
+                       block->orig_bio_bh_end_io.bio = bio->bi_end_io;
+                       block->next_in_same_bio = NULL;
+                       bio->bi_private = block;
+                       bio->bi_end_io = btrfsic_bio_end_io;
+               }
+       }
+       mutex_unlock(&btrfsic_mutex);
+
+       submit_bio(rw, bio);
+}
+
+int btrfsic_mount(struct btrfs_root *root,
+                 struct btrfs_fs_devices *fs_devices,
+                 int including_extent_data, u32 print_mask)
+{
+       int ret;
+       struct btrfsic_state *state;
+       struct list_head *dev_head = &fs_devices->devices;
+       struct btrfs_device *device;
+
+       state = kzalloc(sizeof(*state), GFP_NOFS);
+       if (NULL == state) {
+               printk(KERN_INFO "btrfs check-integrity: kmalloc() failed!\n");
+               return -1;
+       }
+
+       if (!btrfsic_is_initialized) {
+               mutex_init(&btrfsic_mutex);
+               btrfsic_dev_state_hashtable_init(&btrfsic_dev_state_hashtable);
+               btrfsic_is_initialized = 1;
+       }
+       mutex_lock(&btrfsic_mutex);
+       state->root = root;
+       state->print_mask = print_mask;
+       state->include_extent_data = including_extent_data;
+       state->csum_size = 0;
+       INIT_LIST_HEAD(&state->all_blocks_list);
+       btrfsic_block_hashtable_init(&state->block_hashtable);
+       btrfsic_block_link_hashtable_init(&state->block_link_hashtable);
+       state->max_superblock_generation = 0;
+       state->latest_superblock = NULL;
+
+       list_for_each_entry(device, dev_head, dev_list) {
+               struct btrfsic_dev_state *ds;
+               char *p;
+
+               if (!device->bdev || !device->name)
+                       continue;
+
+               ds = btrfsic_dev_state_alloc();
+               if (NULL == ds) {
+                       printk(KERN_INFO
+                              "btrfs check-integrity: kmalloc() failed!\n");
+                       mutex_unlock(&btrfsic_mutex);
+                       return -1;
+               }
+               ds->bdev = device->bdev;
+               ds->state = state;
+               bdevname(ds->bdev, ds->name);
+               ds->name[BDEVNAME_SIZE - 1] = '\0';
+               for (p = ds->name; *p != '\0'; p++);
+               while (p > ds->name && *p != '/')
+                       p--;
+               if (*p == '/')
+                       p++;
+               strlcpy(ds->name, p, sizeof(ds->name));
+               btrfsic_dev_state_hashtable_add(ds,
+                                               &btrfsic_dev_state_hashtable);
+       }
+
+       ret = btrfsic_process_superblock(state, fs_devices);
+       if (0 != ret) {
+               mutex_unlock(&btrfsic_mutex);
+               btrfsic_unmount(root, fs_devices);
+               return ret;
+       }
+
+       if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_DATABASE)
+               btrfsic_dump_database(state);
+       if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_TREE)
+               btrfsic_dump_tree(state);
+
+       mutex_unlock(&btrfsic_mutex);
+       return 0;
+}
+
+void btrfsic_unmount(struct btrfs_root *root,
+                    struct btrfs_fs_devices *fs_devices)
+{
+       struct list_head *elem_all;
+       struct list_head *tmp_all;
+       struct btrfsic_state *state;
+       struct list_head *dev_head = &fs_devices->devices;
+       struct btrfs_device *device;
+
+       if (!btrfsic_is_initialized)
+               return;
+
+       mutex_lock(&btrfsic_mutex);
+
+       state = NULL;
+       list_for_each_entry(device, dev_head, dev_list) {
+               struct btrfsic_dev_state *ds;
+
+               if (!device->bdev || !device->name)
+                       continue;
+
+               ds = btrfsic_dev_state_hashtable_lookup(
+                               device->bdev,
+                               &btrfsic_dev_state_hashtable);
+               if (NULL != ds) {
+                       state = ds->state;
+                       btrfsic_dev_state_hashtable_remove(ds);
+                       btrfsic_dev_state_free(ds);
+               }
+       }
+
+       if (NULL == state) {
+               printk(KERN_INFO
+                      "btrfsic: error, cannot find state information"
+                      " on umount!\n");
+               mutex_unlock(&btrfsic_mutex);
+               return;
+       }
+
+       /*
+        * Don't care about keeping the lists' state up to date,
+        * just free all memory that was allocated dynamically.
+        * Free the blocks and the block_links.
+        */
+       list_for_each_safe(elem_all, tmp_all, &state->all_blocks_list) {
+               struct btrfsic_block *const b_all =
+                   list_entry(elem_all, struct btrfsic_block,
+                              all_blocks_node);
+               struct list_head *elem_ref_to;
+               struct list_head *tmp_ref_to;
+
+               list_for_each_safe(elem_ref_to, tmp_ref_to,
+                                  &b_all->ref_to_list) {
+                       struct btrfsic_block_link *const l =
+                           list_entry(elem_ref_to,
+                                      struct btrfsic_block_link,
+                                      node_ref_to);
+
+                       if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
+                               btrfsic_print_rem_link(state, l);
+
+                       l->ref_cnt--;
+                       if (0 == l->ref_cnt)
+                               btrfsic_block_link_free(l);
+               }
+
+               if (b_all->is_iodone)
+                       btrfsic_block_free(b_all);
+               else
+                       printk(KERN_INFO "btrfs: attempt to free %c-block"
+                              " @%llu (%s/%llu/%d) on umount which is"
+                              " not yet iodone!\n",
+                              btrfsic_get_block_type(state, b_all),
+                              (unsigned long long)b_all->logical_bytenr,
+                              b_all->dev_state->name,
+                              (unsigned long long)b_all->dev_bytenr,
+                              b_all->mirror_num);
+       }
+
+       mutex_unlock(&btrfsic_mutex);
+
+       kfree(state);
+}
diff --git a/fs/btrfs/check-integrity.h b/fs/btrfs/check-integrity.h
new file mode 100644 (file)
index 0000000..8b59175
--- /dev/null
@@ -0,0 +1,36 @@
+/*
+ * Copyright (C) STRATO AG 2011.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#if !defined(__BTRFS_CHECK_INTEGRITY__)
+#define __BTRFS_CHECK_INTEGRITY__
+
+#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+int btrfsic_submit_bh(int rw, struct buffer_head *bh);
+void btrfsic_submit_bio(int rw, struct bio *bio);
+#else
+#define btrfsic_submit_bh submit_bh
+#define btrfsic_submit_bio submit_bio
+#endif
+
+int btrfsic_mount(struct btrfs_root *root,
+                 struct btrfs_fs_devices *fs_devices,
+                 int including_extent_data, u32 print_mask);
+void btrfsic_unmount(struct btrfs_root *root,
+                    struct btrfs_fs_devices *fs_devices);
+
+#endif
index dede441bdeee2678225187bece170710abe1a9b4..0639a555e16ed1975702ed5509dc9bc1c4dbf490 100644 (file)
@@ -240,7 +240,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
 
        cow = btrfs_alloc_free_block(trans, root, buf->len, 0,
                                     new_root_objectid, &disk_key, level,
-                                    buf->start, 0);
+                                    buf->start, 0, 1);
        if (IS_ERR(cow))
                return PTR_ERR(cow);
 
@@ -261,9 +261,9 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
 
        WARN_ON(btrfs_header_generation(buf) > trans->transid);
        if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
-               ret = btrfs_inc_ref(trans, root, cow, 1);
+               ret = btrfs_inc_ref(trans, root, cow, 1, 1);
        else
-               ret = btrfs_inc_ref(trans, root, cow, 0);
+               ret = btrfs_inc_ref(trans, root, cow, 0, 1);
 
        if (ret)
                return ret;
@@ -350,14 +350,14 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
                if ((owner == root->root_key.objectid ||
                     root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
                    !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
-                       ret = btrfs_inc_ref(trans, root, buf, 1);
+                       ret = btrfs_inc_ref(trans, root, buf, 1, 1);
                        BUG_ON(ret);
 
                        if (root->root_key.objectid ==
                            BTRFS_TREE_RELOC_OBJECTID) {
-                               ret = btrfs_dec_ref(trans, root, buf, 0);
+                               ret = btrfs_dec_ref(trans, root, buf, 0, 1);
                                BUG_ON(ret);
-                               ret = btrfs_inc_ref(trans, root, cow, 1);
+                               ret = btrfs_inc_ref(trans, root, cow, 1, 1);
                                BUG_ON(ret);
                        }
                        new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
@@ -365,9 +365,9 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
 
                        if (root->root_key.objectid ==
                            BTRFS_TREE_RELOC_OBJECTID)
-                               ret = btrfs_inc_ref(trans, root, cow, 1);
+                               ret = btrfs_inc_ref(trans, root, cow, 1, 1);
                        else
-                               ret = btrfs_inc_ref(trans, root, cow, 0);
+                               ret = btrfs_inc_ref(trans, root, cow, 0, 1);
                        BUG_ON(ret);
                }
                if (new_flags != 0) {
@@ -381,11 +381,11 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
                if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
                        if (root->root_key.objectid ==
                            BTRFS_TREE_RELOC_OBJECTID)
-                               ret = btrfs_inc_ref(trans, root, cow, 1);
+                               ret = btrfs_inc_ref(trans, root, cow, 1, 1);
                        else
-                               ret = btrfs_inc_ref(trans, root, cow, 0);
+                               ret = btrfs_inc_ref(trans, root, cow, 0, 1);
                        BUG_ON(ret);
-                       ret = btrfs_dec_ref(trans, root, buf, 1);
+                       ret = btrfs_dec_ref(trans, root, buf, 1, 1);
                        BUG_ON(ret);
                }
                clean_tree_block(trans, root, buf);
@@ -446,7 +446,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 
        cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start,
                                     root->root_key.objectid, &disk_key,
-                                    level, search_start, empty_size);
+                                    level, search_start, empty_size, 1);
        if (IS_ERR(cow))
                return PTR_ERR(cow);
 
@@ -484,7 +484,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
                rcu_assign_pointer(root->node, cow);
 
                btrfs_free_tree_block(trans, root, buf, parent_start,
-                                     last_ref);
+                                     last_ref, 1);
                free_extent_buffer(buf);
                add_root_to_dirty_list(root);
        } else {
@@ -500,7 +500,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
                                              trans->transid);
                btrfs_mark_buffer_dirty(parent);
                btrfs_free_tree_block(trans, root, buf, parent_start,
-                                     last_ref);
+                                     last_ref, 1);
        }
        if (unlock_orig)
                btrfs_tree_unlock(buf);
@@ -957,7 +957,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
                free_extent_buffer(mid);
 
                root_sub_used(root, mid->len);
-               btrfs_free_tree_block(trans, root, mid, 0, 1);
+               btrfs_free_tree_block(trans, root, mid, 0, 1, 0);
                /* once for the root ptr */
                free_extent_buffer(mid);
                return 0;
@@ -1015,7 +1015,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
                        if (wret)
                                ret = wret;
                        root_sub_used(root, right->len);
-                       btrfs_free_tree_block(trans, root, right, 0, 1);
+                       btrfs_free_tree_block(trans, root, right, 0, 1, 0);
                        free_extent_buffer(right);
                        right = NULL;
                } else {
@@ -1055,7 +1055,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
                if (wret)
                        ret = wret;
                root_sub_used(root, mid->len);
-               btrfs_free_tree_block(trans, root, mid, 0, 1);
+               btrfs_free_tree_block(trans, root, mid, 0, 1, 0);
                free_extent_buffer(mid);
                mid = NULL;
        } else {
@@ -2089,7 +2089,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
 
        c = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
                                   root->root_key.objectid, &lower_key,
-                                  level, root->node->start, 0);
+                                  level, root->node->start, 0, 0);
        if (IS_ERR(c))
                return PTR_ERR(c);
 
@@ -2216,7 +2216,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
 
        split = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
                                        root->root_key.objectid,
-                                       &disk_key, level, c->start, 0);
+                                       &disk_key, level, c->start, 0, 0);
        if (IS_ERR(split))
                return PTR_ERR(split);
 
@@ -2970,7 +2970,7 @@ again:
 
        right = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
                                        root->root_key.objectid,
-                                       &disk_key, 0, l->start, 0);
+                                       &disk_key, 0, l->start, 0, 0);
        if (IS_ERR(right))
                return PTR_ERR(right);
 
@@ -3781,7 +3781,7 @@ static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
 
        root_sub_used(root, leaf->len);
 
-       btrfs_free_tree_block(trans, root, leaf, 0, 1);
+       btrfs_free_tree_block(trans, root, leaf, 0, 1, 0);
        return 0;
 }
 /*
index 67385033323d6e49817398a1df9b1596df07e839..3c2cbf7b666355b1106fb3fae80451b66ac56086 100644 (file)
@@ -86,6 +86,9 @@ struct btrfs_ordered_sum;
 /* holds checksums of all the data extents */
 #define BTRFS_CSUM_TREE_OBJECTID 7ULL
 
+/* for storing balance parameters in the root tree */
+#define BTRFS_BALANCE_OBJECTID -4ULL
+
 /* orhpan objectid for tracking unlinked/truncated files */
 #define BTRFS_ORPHAN_OBJECTID -5ULL
 
@@ -692,6 +695,54 @@ struct btrfs_root_ref {
        __le16 name_len;
 } __attribute__ ((__packed__));
 
+struct btrfs_disk_balance_args {
+       /*
+        * profiles to operate on, single is denoted by
+        * BTRFS_AVAIL_ALLOC_BIT_SINGLE
+        */
+       __le64 profiles;
+
+       /* usage filter */
+       __le64 usage;
+
+       /* devid filter */
+       __le64 devid;
+
+       /* devid subset filter [pstart..pend) */
+       __le64 pstart;
+       __le64 pend;
+
+       /* btrfs virtual address space subset filter [vstart..vend) */
+       __le64 vstart;
+       __le64 vend;
+
+       /*
+        * profile to convert to, single is denoted by
+        * BTRFS_AVAIL_ALLOC_BIT_SINGLE
+        */
+       __le64 target;
+
+       /* BTRFS_BALANCE_ARGS_* */
+       __le64 flags;
+
+       __le64 unused[8];
+} __attribute__ ((__packed__));
+
+/*
+ * store balance parameters to disk so that balance can be properly
+ * resumed after crash or unmount
+ */
+struct btrfs_balance_item {
+       /* BTRFS_BALANCE_* */
+       __le64 flags;
+
+       struct btrfs_disk_balance_args data;
+       struct btrfs_disk_balance_args meta;
+       struct btrfs_disk_balance_args sys;
+
+       __le64 unused[4];
+} __attribute__ ((__packed__));
+
 #define BTRFS_FILE_EXTENT_INLINE 0
 #define BTRFS_FILE_EXTENT_REG 1
 #define BTRFS_FILE_EXTENT_PREALLOC 2
@@ -751,14 +802,32 @@ struct btrfs_csum_item {
 } __attribute__ ((__packed__));
 
 /* different types of block groups (and chunks) */
-#define BTRFS_BLOCK_GROUP_DATA     (1 << 0)
-#define BTRFS_BLOCK_GROUP_SYSTEM   (1 << 1)
-#define BTRFS_BLOCK_GROUP_METADATA (1 << 2)
-#define BTRFS_BLOCK_GROUP_RAID0    (1 << 3)
-#define BTRFS_BLOCK_GROUP_RAID1    (1 << 4)
-#define BTRFS_BLOCK_GROUP_DUP     (1 << 5)
-#define BTRFS_BLOCK_GROUP_RAID10   (1 << 6)
-#define BTRFS_NR_RAID_TYPES       5
+#define BTRFS_BLOCK_GROUP_DATA         (1ULL << 0)
+#define BTRFS_BLOCK_GROUP_SYSTEM       (1ULL << 1)
+#define BTRFS_BLOCK_GROUP_METADATA     (1ULL << 2)
+#define BTRFS_BLOCK_GROUP_RAID0                (1ULL << 3)
+#define BTRFS_BLOCK_GROUP_RAID1                (1ULL << 4)
+#define BTRFS_BLOCK_GROUP_DUP          (1ULL << 5)
+#define BTRFS_BLOCK_GROUP_RAID10       (1ULL << 6)
+#define BTRFS_BLOCK_GROUP_RESERVED     BTRFS_AVAIL_ALLOC_BIT_SINGLE
+#define BTRFS_NR_RAID_TYPES            5
+
+#define BTRFS_BLOCK_GROUP_TYPE_MASK    (BTRFS_BLOCK_GROUP_DATA |    \
+                                        BTRFS_BLOCK_GROUP_SYSTEM |  \
+                                        BTRFS_BLOCK_GROUP_METADATA)
+
+#define BTRFS_BLOCK_GROUP_PROFILE_MASK (BTRFS_BLOCK_GROUP_RAID0 |   \
+                                        BTRFS_BLOCK_GROUP_RAID1 |   \
+                                        BTRFS_BLOCK_GROUP_DUP |     \
+                                        BTRFS_BLOCK_GROUP_RAID10)
+/*
+ * We need a bit for restriper to be able to tell when chunks of type
+ * SINGLE are available.  This "extended" profile format is used in
+ * fs_info->avail_*_alloc_bits (in-memory) and balance item fields
+ * (on-disk).  The corresponding on-disk bit in chunk.type is reserved
+ * to avoid remappings between two formats in future.
+ */
+#define BTRFS_AVAIL_ALLOC_BIT_SINGLE   (1ULL << 48)
 
 struct btrfs_block_group_item {
        __le64 used;
@@ -916,6 +985,7 @@ struct btrfs_block_group_cache {
 struct reloc_control;
 struct btrfs_device;
 struct btrfs_fs_devices;
+struct btrfs_balance_control;
 struct btrfs_delayed_root;
 struct btrfs_fs_info {
        u8 fsid[BTRFS_FSID_SIZE];
@@ -971,7 +1041,7 @@ struct btrfs_fs_info {
         * is required instead of the faster short fsync log commits
         */
        u64 last_trans_log_full_commit;
-       unsigned long mount_opt:20;
+       unsigned long mount_opt:21;
        unsigned long compress_type:4;
        u64 max_inline;
        u64 alloc_start;
@@ -1132,12 +1202,23 @@ struct btrfs_fs_info {
        spinlock_t ref_cache_lock;
        u64 total_ref_cache_size;
 
+       /*
+        * these three are in extended format (availability of single
+        * chunks is denoted by BTRFS_AVAIL_ALLOC_BIT_SINGLE bit, other
+        * types are denoted by corresponding BTRFS_BLOCK_GROUP_* bits)
+        */
        u64 avail_data_alloc_bits;
        u64 avail_metadata_alloc_bits;
        u64 avail_system_alloc_bits;
-       u64 data_alloc_profile;
-       u64 metadata_alloc_profile;
-       u64 system_alloc_profile;
+
+       /* restriper state */
+       spinlock_t balance_lock;
+       struct mutex balance_mutex;
+       atomic_t balance_running;
+       atomic_t balance_pause_req;
+       atomic_t balance_cancel_req;
+       struct btrfs_balance_control *balance_ctl;
+       wait_queue_head_t balance_wait_q;
 
        unsigned data_chunk_allocations;
        unsigned metadata_ratio;
@@ -1155,6 +1236,10 @@ struct btrfs_fs_info {
        int scrub_workers_refcnt;
        struct btrfs_workers scrub_workers;
 
+#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+       u32 check_integrity_print_mask;
+#endif
+
        /* filesystem state */
        u64 fs_state;
 
@@ -1383,6 +1468,8 @@ struct btrfs_ioctl_defrag_range_args {
 #define BTRFS_DEV_ITEM_KEY     216
 #define BTRFS_CHUNK_ITEM_KEY   228
 
+#define BTRFS_BALANCE_ITEM_KEY 248
+
 /*
  * string items are for debugging.  They just store a short string of
  * data in the FS
@@ -1413,6 +1500,9 @@ struct btrfs_ioctl_defrag_range_args {
 #define BTRFS_MOUNT_AUTO_DEFRAG                (1 << 16)
 #define BTRFS_MOUNT_INODE_MAP_CACHE    (1 << 17)
 #define BTRFS_MOUNT_RECOVERY           (1 << 18)
+#define BTRFS_MOUNT_SKIP_BALANCE       (1 << 19)
+#define BTRFS_MOUNT_CHECK_INTEGRITY    (1 << 20)
+#define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 21)
 
 #define btrfs_clear_opt(o, opt)                ((o) &= ~BTRFS_MOUNT_##opt)
 #define btrfs_set_opt(o, opt)          ((o) |= BTRFS_MOUNT_##opt)
@@ -2077,8 +2167,86 @@ BTRFS_SETGET_STACK_FUNCS(backup_bytes_used, struct btrfs_root_backup,
 BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup,
                   num_devices, 64);
 
-/* struct btrfs_super_block */
+/* struct btrfs_balance_item */
+BTRFS_SETGET_FUNCS(balance_flags, struct btrfs_balance_item, flags, 64);
 
+static inline void btrfs_balance_data(struct extent_buffer *eb,
+                                     struct btrfs_balance_item *bi,
+                                     struct btrfs_disk_balance_args *ba)
+{
+       read_eb_member(eb, bi, struct btrfs_balance_item, data, ba);
+}
+
+static inline void btrfs_set_balance_data(struct extent_buffer *eb,
+                                         struct btrfs_balance_item *bi,
+                                         struct btrfs_disk_balance_args *ba)
+{
+       write_eb_member(eb, bi, struct btrfs_balance_item, data, ba);
+}
+
+static inline void btrfs_balance_meta(struct extent_buffer *eb,
+                                     struct btrfs_balance_item *bi,
+                                     struct btrfs_disk_balance_args *ba)
+{
+       read_eb_member(eb, bi, struct btrfs_balance_item, meta, ba);
+}
+
+static inline void btrfs_set_balance_meta(struct extent_buffer *eb,
+                                         struct btrfs_balance_item *bi,
+                                         struct btrfs_disk_balance_args *ba)
+{
+       write_eb_member(eb, bi, struct btrfs_balance_item, meta, ba);
+}
+
+static inline void btrfs_balance_sys(struct extent_buffer *eb,
+                                    struct btrfs_balance_item *bi,
+                                    struct btrfs_disk_balance_args *ba)
+{
+       read_eb_member(eb, bi, struct btrfs_balance_item, sys, ba);
+}
+
+static inline void btrfs_set_balance_sys(struct extent_buffer *eb,
+                                        struct btrfs_balance_item *bi,
+                                        struct btrfs_disk_balance_args *ba)
+{
+       write_eb_member(eb, bi, struct btrfs_balance_item, sys, ba);
+}
+
+static inline void
+btrfs_disk_balance_args_to_cpu(struct btrfs_balance_args *cpu,
+                              struct btrfs_disk_balance_args *disk)
+{
+       memset(cpu, 0, sizeof(*cpu));
+
+       cpu->profiles = le64_to_cpu(disk->profiles);
+       cpu->usage = le64_to_cpu(disk->usage);
+       cpu->devid = le64_to_cpu(disk->devid);
+       cpu->pstart = le64_to_cpu(disk->pstart);
+       cpu->pend = le64_to_cpu(disk->pend);
+       cpu->vstart = le64_to_cpu(disk->vstart);
+       cpu->vend = le64_to_cpu(disk->vend);
+       cpu->target = le64_to_cpu(disk->target);
+       cpu->flags = le64_to_cpu(disk->flags);
+}
+
+static inline void
+btrfs_cpu_balance_args_to_disk(struct btrfs_disk_balance_args *disk,
+                              struct btrfs_balance_args *cpu)
+{
+       memset(disk, 0, sizeof(*disk));
+
+       disk->profiles = cpu_to_le64(cpu->profiles);
+       disk->usage = cpu_to_le64(cpu->usage);
+       disk->devid = cpu_to_le64(cpu->devid);
+       disk->pstart = cpu_to_le64(cpu->pstart);
+       disk->pend = cpu_to_le64(cpu->pend);
+       disk->vstart = cpu_to_le64(cpu->vstart);
+       disk->vend = cpu_to_le64(cpu->vend);
+       disk->target = cpu_to_le64(cpu->target);
+       disk->flags = cpu_to_le64(cpu->flags);
+}
+
+/* struct btrfs_super_block */
 BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64);
 BTRFS_SETGET_STACK_FUNCS(super_flags, struct btrfs_super_block, flags, 64);
 BTRFS_SETGET_STACK_FUNCS(super_generation, struct btrfs_super_block,
@@ -2277,11 +2445,11 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
                                        struct btrfs_root *root, u32 blocksize,
                                        u64 parent, u64 root_objectid,
                                        struct btrfs_disk_key *key, int level,
-                                       u64 hint, u64 empty_size);
+                                       u64 hint, u64 empty_size, int for_cow);
 void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
                           struct btrfs_root *root,
                           struct extent_buffer *buf,
-                          u64 parent, int last_ref);
+                          u64 parent, int last_ref, int for_cow);
 struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
                                            struct btrfs_root *root,
                                            u64 bytenr, u32 blocksize,
@@ -2301,17 +2469,17 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
                                  u64 search_end, struct btrfs_key *ins,
                                  u64 data);
 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-                 struct extent_buffer *buf, int full_backref);
+                 struct extent_buffer *buf, int full_backref, int for_cow);
 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-                 struct extent_buffer *buf, int full_backref);
+                 struct extent_buffer *buf, int full_backref, int for_cow);
 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
                                struct btrfs_root *root,
                                u64 bytenr, u64 num_bytes, u64 flags,
                                int is_data);
 int btrfs_free_extent(struct btrfs_trans_handle *trans,
                      struct btrfs_root *root,
-                     u64 bytenr, u64 num_bytes, u64 parent,
-                     u64 root_objectid, u64 owner, u64 offset);
+                     u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
+                     u64 owner, u64 offset, int for_cow);
 
 int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len);
 int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
@@ -2323,7 +2491,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
                         struct btrfs_root *root,
                         u64 bytenr, u64 num_bytes, u64 parent,
-                        u64 root_objectid, u64 owner, u64 offset);
+                        u64 root_objectid, u64 owner, u64 offset, int for_cow);
 
 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
                                    struct btrfs_root *root);
@@ -2482,10 +2650,18 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
 }
 
 int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path);
+static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p)
+{
+       ++p->slots[0];
+       if (p->slots[0] >= btrfs_header_nritems(p->nodes[0]))
+               return btrfs_next_leaf(root, p);
+       return 0;
+}
 int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
 int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf);
 void btrfs_drop_snapshot(struct btrfs_root *root,
-                        struct btrfs_block_rsv *block_rsv, int update_ref);
+                        struct btrfs_block_rsv *block_rsv, int update_ref,
+                        int for_reloc);
 int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
                        struct btrfs_root *root,
                        struct extent_buffer *node,
@@ -2500,6 +2676,7 @@ static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info)
 }
 static inline void free_fs_info(struct btrfs_fs_info *fs_info)
 {
+       kfree(fs_info->balance_ctl);
        kfree(fs_info->delayed_root);
        kfree(fs_info->extent_root);
        kfree(fs_info->tree_root);
@@ -2510,6 +2687,24 @@ static inline void free_fs_info(struct btrfs_fs_info *fs_info)
        kfree(fs_info->super_for_commit);
        kfree(fs_info);
 }
+/**
+ * profile_is_valid - tests whether a given profile is valid and reduced
+ * @flags: profile to validate
+ * @extended: if true @flags is treated as an extended profile
+ */
+static inline int profile_is_valid(u64 flags, int extended)
+{
+       u64 mask = ~BTRFS_BLOCK_GROUP_PROFILE_MASK;
+
+       flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK;
+       if (extended)
+               mask &= ~BTRFS_AVAIL_ALLOC_BIT_SINGLE;
+
+       if (flags & mask)
+               return 0;
+       /* true if zero or exactly one bit set */
+       return (flags & (~flags + 1)) == flags;
+}
 
 /* root-item.c */
 int btrfs_find_root_ref(struct btrfs_root *tree_root,
index 9c1eccc2c503e5eec8bd20d3dfc057a417eef89a..fe4cd0f1cef188b8cf584c67c8ab0f58ffb62cbb 100644 (file)
@@ -595,8 +595,12 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
 
        num_bytes = btrfs_calc_trans_metadata_size(root, 1);
        ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
-       if (!ret)
+       if (!ret) {
+               trace_btrfs_space_reservation(root->fs_info, "delayed_item",
+                                             item->key.objectid,
+                                             num_bytes, 1);
                item->bytes_reserved = num_bytes;
+       }
 
        return ret;
 }
@@ -610,6 +614,9 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root,
                return;
 
        rsv = &root->fs_info->delayed_block_rsv;
+       trace_btrfs_space_reservation(root->fs_info, "delayed_item",
+                                     item->key.objectid, item->bytes_reserved,
+                                     0);
        btrfs_block_rsv_release(root, rsv,
                                item->bytes_reserved);
 }
@@ -624,7 +631,7 @@ static int btrfs_delayed_inode_reserve_metadata(
        struct btrfs_block_rsv *dst_rsv;
        u64 num_bytes;
        int ret;
-       int release = false;
+       bool release = false;
 
        src_rsv = trans->block_rsv;
        dst_rsv = &root->fs_info->delayed_block_rsv;
@@ -651,8 +658,13 @@ static int btrfs_delayed_inode_reserve_metadata(
                 */
                if (ret == -EAGAIN)
                        ret = -ENOSPC;
-               if (!ret)
+               if (!ret) {
                        node->bytes_reserved = num_bytes;
+                       trace_btrfs_space_reservation(root->fs_info,
+                                                     "delayed_inode",
+                                                     btrfs_ino(inode),
+                                                     num_bytes, 1);
+               }
                return ret;
        } else if (src_rsv == &root->fs_info->delalloc_block_rsv) {
                spin_lock(&BTRFS_I(inode)->lock);
@@ -707,11 +719,17 @@ out:
         * reservation here.  I think it may be time for a documentation page on
         * how block rsvs. work.
         */
-       if (!ret)
+       if (!ret) {
+               trace_btrfs_space_reservation(root->fs_info, "delayed_inode",
+                                             btrfs_ino(inode), num_bytes, 1);
                node->bytes_reserved = num_bytes;
+       }
 
-       if (release)
+       if (release) {
+               trace_btrfs_space_reservation(root->fs_info, "delalloc",
+                                             btrfs_ino(inode), num_bytes, 0);
                btrfs_block_rsv_release(root, src_rsv, num_bytes);
+       }
 
        return ret;
 }
@@ -725,6 +743,8 @@ static void btrfs_delayed_inode_release_metadata(struct btrfs_root *root,
                return;
 
        rsv = &root->fs_info->delayed_block_rsv;
+       trace_btrfs_space_reservation(root->fs_info, "delayed_inode",
+                                     node->inode_id, node->bytes_reserved, 0);
        btrfs_block_rsv_release(root, rsv,
                                node->bytes_reserved);
        node->bytes_reserved = 0;
@@ -1372,13 +1392,6 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
                goto release_node;
        }
 
-       ret = btrfs_delayed_item_reserve_metadata(trans, root, delayed_item);
-       /*
-        * we have reserved enough space when we start a new transaction,
-        * so reserving metadata failure is impossible
-        */
-       BUG_ON(ret);
-
        delayed_item->key.objectid = btrfs_ino(dir);
        btrfs_set_key_type(&delayed_item->key, BTRFS_DIR_INDEX_KEY);
        delayed_item->key.offset = index;
@@ -1391,6 +1404,14 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
        dir_item->type = type;
        memcpy((char *)(dir_item + 1), name, name_len);
 
+       ret = btrfs_delayed_item_reserve_metadata(trans, root, delayed_item);
+       /*
+        * we have reserved enough space when we start a new transaction,
+        * so reserving metadata failure is impossible
+        */
+       BUG_ON(ret);
+
+
        mutex_lock(&delayed_node->mutex);
        ret = __btrfs_add_delayed_insertion_item(delayed_node, delayed_item);
        if (unlikely(ret)) {
index 125cf76fcd086803d35bd257bdb54168486e0e0b..66e4f29505a33dbecd45b5d6a80e878c87818bc0 100644 (file)
@@ -101,6 +101,11 @@ static int comp_entry(struct btrfs_delayed_ref_node *ref2,
                return -1;
        if (ref1->type > ref2->type)
                return 1;
+       /* merging of sequenced refs is not allowed */
+       if (ref1->seq < ref2->seq)
+               return -1;
+       if (ref1->seq > ref2->seq)
+               return 1;
        if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY ||
            ref1->type == BTRFS_SHARED_BLOCK_REF_KEY) {
                return comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref2),
@@ -150,16 +155,22 @@ static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root,
 
 /*
  * find an head entry based on bytenr. This returns the delayed ref
- * head if it was able to find one, or NULL if nothing was in that spot
+ * head if it was able to find one, or NULL if nothing was in that spot.
+ * If return_bigger is given, the next bigger entry is returned if no exact
+ * match is found.
  */
 static struct btrfs_delayed_ref_node *find_ref_head(struct rb_root *root,
                                  u64 bytenr,
-                                 struct btrfs_delayed_ref_node **last)
+                                 struct btrfs_delayed_ref_node **last,
+                                 int return_bigger)
 {
-       struct rb_node *n = root->rb_node;
+       struct rb_node *n;
        struct btrfs_delayed_ref_node *entry;
-       int cmp;
+       int cmp = 0;
 
+again:
+       n = root->rb_node;
+       entry = NULL;
        while (n) {
                entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
                WARN_ON(!entry->in_tree);
@@ -182,6 +193,19 @@ static struct btrfs_delayed_ref_node *find_ref_head(struct rb_root *root,
                else
                        return entry;
        }
+       if (entry && return_bigger) {
+               if (cmp > 0) {
+                       n = rb_next(&entry->rb_node);
+                       if (!n)
+                               n = rb_first(root);
+                       entry = rb_entry(n, struct btrfs_delayed_ref_node,
+                                        rb_node);
+                       bytenr = entry->bytenr;
+                       return_bigger = 0;
+                       goto again;
+               }
+               return entry;
+       }
        return NULL;
 }
 
@@ -209,6 +233,24 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
        return 0;
 }
 
+int btrfs_check_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs,
+                           u64 seq)
+{
+       struct seq_list *elem;
+
+       assert_spin_locked(&delayed_refs->lock);
+       if (list_empty(&delayed_refs->seq_head))
+               return 0;
+
+       elem = list_first_entry(&delayed_refs->seq_head, struct seq_list, list);
+       if (seq >= elem->seq) {
+               pr_debug("holding back delayed_ref %llu, lowest is %llu (%p)\n",
+                        seq, elem->seq, delayed_refs);
+               return 1;
+       }
+       return 0;
+}
+
 int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
                           struct list_head *cluster, u64 start)
 {
@@ -223,20 +265,8 @@ int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
                node = rb_first(&delayed_refs->root);
        } else {
                ref = NULL;
-               find_ref_head(&delayed_refs->root, start, &ref);
+               find_ref_head(&delayed_refs->root, start + 1, &ref, 1);
                if (ref) {
-                       struct btrfs_delayed_ref_node *tmp;
-
-                       node = rb_prev(&ref->rb_node);
-                       while (node) {
-                               tmp = rb_entry(node,
-                                              struct btrfs_delayed_ref_node,
-                                              rb_node);
-                               if (tmp->bytenr < start)
-                                       break;
-                               ref = tmp;
-                               node = rb_prev(&ref->rb_node);
-                       }
                        node = &ref->rb_node;
                } else
                        node = rb_first(&delayed_refs->root);
@@ -390,7 +420,8 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
  * this does all the dirty work in terms of maintaining the correct
  * overall modification count.
  */
-static noinline int add_delayed_ref_head(struct btrfs_trans_handle *trans,
+static noinline int add_delayed_ref_head(struct btrfs_fs_info *fs_info,
+                                       struct btrfs_trans_handle *trans,
                                        struct btrfs_delayed_ref_node *ref,
                                        u64 bytenr, u64 num_bytes,
                                        int action, int is_data)
@@ -437,6 +468,7 @@ static noinline int add_delayed_ref_head(struct btrfs_trans_handle *trans,
        ref->action  = 0;
        ref->is_head = 1;
        ref->in_tree = 1;
+       ref->seq = 0;
 
        head_ref = btrfs_delayed_node_to_head(ref);
        head_ref->must_insert_reserved = must_insert_reserved;
@@ -468,14 +500,17 @@ static noinline int add_delayed_ref_head(struct btrfs_trans_handle *trans,
 /*
  * helper to insert a delayed tree ref into the rbtree.
  */
-static noinline int add_delayed_tree_ref(struct btrfs_trans_handle *trans,
+static noinline int add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
+                                        struct btrfs_trans_handle *trans,
                                         struct btrfs_delayed_ref_node *ref,
                                         u64 bytenr, u64 num_bytes, u64 parent,
-                                        u64 ref_root, int level, int action)
+                                        u64 ref_root, int level, int action,
+                                        int for_cow)
 {
        struct btrfs_delayed_ref_node *existing;
        struct btrfs_delayed_tree_ref *full_ref;
        struct btrfs_delayed_ref_root *delayed_refs;
+       u64 seq = 0;
 
        if (action == BTRFS_ADD_DELAYED_EXTENT)
                action = BTRFS_ADD_DELAYED_REF;
@@ -491,14 +526,17 @@ static noinline int add_delayed_tree_ref(struct btrfs_trans_handle *trans,
        ref->is_head = 0;
        ref->in_tree = 1;
 
+       if (need_ref_seq(for_cow, ref_root))
+               seq = inc_delayed_seq(delayed_refs);
+       ref->seq = seq;
+
        full_ref = btrfs_delayed_node_to_tree_ref(ref);
-       if (parent) {
-               full_ref->parent = parent;
+       full_ref->parent = parent;
+       full_ref->root = ref_root;
+       if (parent)
                ref->type = BTRFS_SHARED_BLOCK_REF_KEY;
-       } else {
-               full_ref->root = ref_root;
+       else
                ref->type = BTRFS_TREE_BLOCK_REF_KEY;
-       }
        full_ref->level = level;
 
        trace_btrfs_delayed_tree_ref(ref, full_ref, action);
@@ -522,15 +560,17 @@ static noinline int add_delayed_tree_ref(struct btrfs_trans_handle *trans,
 /*
  * helper to insert a delayed data ref into the rbtree.
  */
-static noinline int add_delayed_data_ref(struct btrfs_trans_handle *trans,
+static noinline int add_delayed_data_ref(struct btrfs_fs_info *fs_info,
+                                        struct btrfs_trans_handle *trans,
                                         struct btrfs_delayed_ref_node *ref,
                                         u64 bytenr, u64 num_bytes, u64 parent,
                                         u64 ref_root, u64 owner, u64 offset,
-                                        int action)
+                                        int action, int for_cow)
 {
        struct btrfs_delayed_ref_node *existing;
        struct btrfs_delayed_data_ref *full_ref;
        struct btrfs_delayed_ref_root *delayed_refs;
+       u64 seq = 0;
 
        if (action == BTRFS_ADD_DELAYED_EXTENT)
                action = BTRFS_ADD_DELAYED_REF;
@@ -546,14 +586,18 @@ static noinline int add_delayed_data_ref(struct btrfs_trans_handle *trans,
        ref->is_head = 0;
        ref->in_tree = 1;
 
+       if (need_ref_seq(for_cow, ref_root))
+               seq = inc_delayed_seq(delayed_refs);
+       ref->seq = seq;
+
        full_ref = btrfs_delayed_node_to_data_ref(ref);
-       if (parent) {
-               full_ref->parent = parent;
+       full_ref->parent = parent;
+       full_ref->root = ref_root;
+       if (parent)
                ref->type = BTRFS_SHARED_DATA_REF_KEY;
-       } else {
-               full_ref->root = ref_root;
+       else
                ref->type = BTRFS_EXTENT_DATA_REF_KEY;
-       }
+
        full_ref->objectid = owner;
        full_ref->offset = offset;
 
@@ -580,10 +624,12 @@ static noinline int add_delayed_data_ref(struct btrfs_trans_handle *trans,
  * to make sure the delayed ref is eventually processed before this
  * transaction commits.
  */
-int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
+int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
+                              struct btrfs_trans_handle *trans,
                               u64 bytenr, u64 num_bytes, u64 parent,
                               u64 ref_root,  int level, int action,
-                              struct btrfs_delayed_extent_op *extent_op)
+                              struct btrfs_delayed_extent_op *extent_op,
+                              int for_cow)
 {
        struct btrfs_delayed_tree_ref *ref;
        struct btrfs_delayed_ref_head *head_ref;
@@ -610,13 +656,17 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
         * insert both the head node and the new ref without dropping
         * the spin lock
         */
-       ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, num_bytes,
-                                  action, 0);
+       ret = add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr,
+                                  num_bytes, action, 0);
        BUG_ON(ret);
 
-       ret = add_delayed_tree_ref(trans, &ref->node, bytenr, num_bytes,
-                                  parent, ref_root, level, action);
+       ret = add_delayed_tree_ref(fs_info, trans, &ref->node, bytenr,
+                                  num_bytes, parent, ref_root, level, action,
+                                  for_cow);
        BUG_ON(ret);
+       if (!need_ref_seq(for_cow, ref_root) &&
+           waitqueue_active(&delayed_refs->seq_wait))
+               wake_up(&delayed_refs->seq_wait);
        spin_unlock(&delayed_refs->lock);
        return 0;
 }
@@ -624,11 +674,13 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
 /*
  * add a delayed data ref. it's similar to btrfs_add_delayed_tree_ref.
  */
-int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
+int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
+                              struct btrfs_trans_handle *trans,
                               u64 bytenr, u64 num_bytes,
                               u64 parent, u64 ref_root,
                               u64 owner, u64 offset, int action,
-                              struct btrfs_delayed_extent_op *extent_op)
+                              struct btrfs_delayed_extent_op *extent_op,
+                              int for_cow)
 {
        struct btrfs_delayed_data_ref *ref;
        struct btrfs_delayed_ref_head *head_ref;
@@ -655,18 +707,23 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
         * insert both the head node and the new ref without dropping
         * the spin lock
         */
-       ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, num_bytes,
-                                  action, 1);
+       ret = add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr,
+                                  num_bytes, action, 1);
        BUG_ON(ret);
 
-       ret = add_delayed_data_ref(trans, &ref->node, bytenr, num_bytes,
-                                  parent, ref_root, owner, offset, action);
+       ret = add_delayed_data_ref(fs_info, trans, &ref->node, bytenr,
+                                  num_bytes, parent, ref_root, owner, offset,
+                                  action, for_cow);
        BUG_ON(ret);
+       if (!need_ref_seq(for_cow, ref_root) &&
+           waitqueue_active(&delayed_refs->seq_wait))
+               wake_up(&delayed_refs->seq_wait);
        spin_unlock(&delayed_refs->lock);
        return 0;
 }
 
-int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
+int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
+                               struct btrfs_trans_handle *trans,
                                u64 bytenr, u64 num_bytes,
                                struct btrfs_delayed_extent_op *extent_op)
 {
@@ -683,11 +740,13 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
        delayed_refs = &trans->transaction->delayed_refs;
        spin_lock(&delayed_refs->lock);
 
-       ret = add_delayed_ref_head(trans, &head_ref->node, bytenr,
+       ret = add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr,
                                   num_bytes, BTRFS_UPDATE_DELAYED_HEAD,
                                   extent_op->is_data);
        BUG_ON(ret);
 
+       if (waitqueue_active(&delayed_refs->seq_wait))
+               wake_up(&delayed_refs->seq_wait);
        spin_unlock(&delayed_refs->lock);
        return 0;
 }
@@ -704,7 +763,7 @@ btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr)
        struct btrfs_delayed_ref_root *delayed_refs;
 
        delayed_refs = &trans->transaction->delayed_refs;
-       ref = find_ref_head(&delayed_refs->root, bytenr, NULL);
+       ref = find_ref_head(&delayed_refs->root, bytenr, NULL, 0);
        if (ref)
                return btrfs_delayed_node_to_head(ref);
        return NULL;
index e287e3b0eab0d970d37f0f4c70fd688b22f276ac..d8f244d9492511e3b108b26bcf4da1bc9fbf6826 100644 (file)
@@ -33,6 +33,9 @@ struct btrfs_delayed_ref_node {
        /* the size of the extent */
        u64 num_bytes;
 
+       /* seq number to keep track of insertion order */
+       u64 seq;
+
        /* ref count on this data structure */
        atomic_t refs;
 
@@ -98,19 +101,15 @@ struct btrfs_delayed_ref_head {
 
 struct btrfs_delayed_tree_ref {
        struct btrfs_delayed_ref_node node;
-       union {
-               u64 root;
-               u64 parent;
-       };
+       u64 root;
+       u64 parent;
        int level;
 };
 
 struct btrfs_delayed_data_ref {
        struct btrfs_delayed_ref_node node;
-       union {
-               u64 root;
-               u64 parent;
-       };
+       u64 root;
+       u64 parent;
        u64 objectid;
        u64 offset;
 };
@@ -140,6 +139,26 @@ struct btrfs_delayed_ref_root {
        int flushing;
 
        u64 run_delayed_start;
+
+       /*
+        * seq number of delayed refs. We need to know if a backref was being
+        * added before the currently processed ref or afterwards.
+        */
+       u64 seq;
+
+       /*
+        * seq_list holds a list of all seq numbers that are currently being
+        * added to the list. While walking backrefs (btrfs_find_all_roots,
+        * qgroups), which might take some time, no newer ref must be processed,
+        * as it might influence the outcome of the walk.
+        */
+       struct list_head seq_head;
+
+       /*
+        * when the only refs we have in the list must not be processed, we want
+        * to wait for more refs to show up or for the end of backref walking.
+        */
+       wait_queue_head_t seq_wait;
 };
 
 static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
@@ -151,16 +170,21 @@ static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
        }
 }
 
-int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
+int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
+                              struct btrfs_trans_handle *trans,
                               u64 bytenr, u64 num_bytes, u64 parent,
                               u64 ref_root, int level, int action,
-                              struct btrfs_delayed_extent_op *extent_op);
-int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
+                              struct btrfs_delayed_extent_op *extent_op,
+                              int for_cow);
+int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
+                              struct btrfs_trans_handle *trans,
                               u64 bytenr, u64 num_bytes,
                               u64 parent, u64 ref_root,
                               u64 owner, u64 offset, int action,
-                              struct btrfs_delayed_extent_op *extent_op);
-int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
+                              struct btrfs_delayed_extent_op *extent_op,
+                              int for_cow);
+int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
+                               struct btrfs_trans_handle *trans,
                                u64 bytenr, u64 num_bytes,
                                struct btrfs_delayed_extent_op *extent_op);
 
@@ -170,6 +194,60 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
                           struct btrfs_delayed_ref_head *head);
 int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
                           struct list_head *cluster, u64 search_start);
+
+struct seq_list {
+       struct list_head list;
+       u64 seq;
+};
+
+static inline u64 inc_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs)
+{
+       assert_spin_locked(&delayed_refs->lock);
+       ++delayed_refs->seq;
+       return delayed_refs->seq;
+}
+
+static inline void
+btrfs_get_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs,
+                     struct seq_list *elem)
+{
+       assert_spin_locked(&delayed_refs->lock);
+       elem->seq = delayed_refs->seq;
+       list_add_tail(&elem->list, &delayed_refs->seq_head);
+}
+
+static inline void
+btrfs_put_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs,
+                     struct seq_list *elem)
+{
+       spin_lock(&delayed_refs->lock);
+       list_del(&elem->list);
+       wake_up(&delayed_refs->seq_wait);
+       spin_unlock(&delayed_refs->lock);
+}
+
+int btrfs_check_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs,
+                           u64 seq);
+
+/*
+ * delayed refs with a ref_seq > 0 must be held back during backref walking.
+ * this only applies to items in one of the fs-trees. for_cow items never need
+ * to be held back, so they won't get a ref_seq number.
+ */
+static inline int need_ref_seq(int for_cow, u64 rootid)
+{
+       if (for_cow)
+               return 0;
+
+       if (rootid == BTRFS_FS_TREE_OBJECTID)
+               return 1;
+
+       if ((s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID)
+               return 1;
+
+       return 0;
+}
+
 /*
  * a node might live in a head or a regular ref, this lets you
  * test for the proper type to use.
index d8525662ca7a721b18e197191816ea16b095eb19..89e99eb384db54a8006ae991bd83ea2e61abe0eb 100644 (file)
@@ -43,6 +43,7 @@
 #include "tree-log.h"
 #include "free-space-cache.h"
 #include "inode-map.h"
+#include "check-integrity.h"
 
 static struct extent_io_ops btree_extent_io_ops;
 static void end_workqueue_fn(struct btrfs_work *work);
@@ -1244,7 +1245,8 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
        root->ref_cows = 0;
 
        leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
-                                     BTRFS_TREE_LOG_OBJECTID, NULL, 0, 0, 0);
+                                     BTRFS_TREE_LOG_OBJECTID, NULL,
+                                     0, 0, 0, 0);
        if (IS_ERR(leaf)) {
                kfree(root);
                return ERR_CAST(leaf);
@@ -1998,6 +2000,17 @@ struct btrfs_root *open_ctree(struct super_block *sb,
        init_waitqueue_head(&fs_info->scrub_pause_wait);
        init_rwsem(&fs_info->scrub_super_lock);
        fs_info->scrub_workers_refcnt = 0;
+#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+       fs_info->check_integrity_print_mask = 0;
+#endif
+
+       spin_lock_init(&fs_info->balance_lock);
+       mutex_init(&fs_info->balance_mutex);
+       atomic_set(&fs_info->balance_running, 0);
+       atomic_set(&fs_info->balance_pause_req, 0);
+       atomic_set(&fs_info->balance_cancel_req, 0);
+       fs_info->balance_ctl = NULL;
+       init_waitqueue_head(&fs_info->balance_wait_q);
 
        sb->s_blocksize = 4096;
        sb->s_blocksize_bits = blksize_bits(4096);
@@ -2267,9 +2280,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
           (unsigned long)btrfs_header_chunk_tree_uuid(chunk_root->node),
           BTRFS_UUID_SIZE);
 
-       mutex_lock(&fs_info->chunk_mutex);
        ret = btrfs_read_chunk_tree(chunk_root);
-       mutex_unlock(&fs_info->chunk_mutex);
        if (ret) {
                printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n",
                       sb->s_id);
@@ -2318,9 +2329,6 @@ retry_root_backup:
 
        fs_info->generation = generation;
        fs_info->last_trans_committed = generation;
-       fs_info->data_alloc_profile = (u64)-1;
-       fs_info->metadata_alloc_profile = (u64)-1;
-       fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;
 
        ret = btrfs_init_space_info(fs_info);
        if (ret) {
@@ -2353,6 +2361,19 @@ retry_root_backup:
                btrfs_set_opt(fs_info->mount_opt, SSD);
        }
 
+#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+       if (btrfs_test_opt(tree_root, CHECK_INTEGRITY)) {
+               ret = btrfsic_mount(tree_root, fs_devices,
+                                   btrfs_test_opt(tree_root,
+                                       CHECK_INTEGRITY_INCLUDING_EXTENT_DATA) ?
+                                   1 : 0,
+                                   fs_info->check_integrity_print_mask);
+               if (ret)
+                       printk(KERN_WARNING "btrfs: failed to initialize"
+                              " integrity check module %s\n", sb->s_id);
+       }
+#endif
+
        /* do not make disk changes in broken FS */
        if (btrfs_super_log_root(disk_super) != 0 &&
            !(fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)) {
@@ -2423,6 +2444,10 @@ retry_root_backup:
                if (!err)
                        err = btrfs_orphan_cleanup(fs_info->tree_root);
                up_read(&fs_info->cleanup_work_sem);
+
+               if (!err)
+                       err = btrfs_recover_balance(fs_info->tree_root);
+
                if (err) {
                        close_ctree(tree_root);
                        return ERR_PTR(err);
@@ -2631,7 +2656,7 @@ static int write_dev_supers(struct btrfs_device *device,
                 * we fua the first super.  The others we allow
                 * to go down lazy.
                 */
-               ret = submit_bh(WRITE_FUA, bh);
+               ret = btrfsic_submit_bh(WRITE_FUA, bh);
                if (ret)
                        errors++;
        }
@@ -2708,7 +2733,7 @@ static int write_dev_flush(struct btrfs_device *device, int wait)
        device->flush_bio = bio;
 
        bio_get(bio);
-       submit_bio(WRITE_FLUSH, bio);
+       btrfsic_submit_bio(WRITE_FLUSH, bio);
 
        return 0;
 }
@@ -2972,6 +2997,9 @@ int close_ctree(struct btrfs_root *root)
        fs_info->closing = 1;
        smp_mb();
 
+       /* pause restriper - we want to resume on mount */
+       btrfs_pause_balance(root->fs_info);
+
        btrfs_scrub_cancel(root);
 
        /* wait for any defraggers to finish */
@@ -3054,6 +3082,11 @@ int close_ctree(struct btrfs_root *root)
        btrfs_stop_workers(&fs_info->caching_workers);
        btrfs_stop_workers(&fs_info->readahead_workers);
 
+#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+       if (btrfs_test_opt(root, CHECK_INTEGRITY))
+               btrfsic_unmount(root, fs_info->fs_devices);
+#endif
+
        btrfs_close_devices(fs_info->fs_devices);
        btrfs_mapping_tree_free(&fs_info->mapping_tree);
 
index f5fbe576d2baf48519a01bd449344b49edffa070..700879ed64cfcc73f43f6e03819e579578db1e57 100644 (file)
@@ -618,8 +618,7 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
        struct list_head *head = &info->space_info;
        struct btrfs_space_info *found;
 
-       flags &= BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_SYSTEM |
-                BTRFS_BLOCK_GROUP_METADATA;
+       flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;
 
        rcu_read_lock();
        list_for_each_entry_rcu(found, head, list) {
@@ -1872,20 +1871,24 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
                         struct btrfs_root *root,
                         u64 bytenr, u64 num_bytes, u64 parent,
-                        u64 root_objectid, u64 owner, u64 offset)
+                        u64 root_objectid, u64 owner, u64 offset, int for_cow)
 {
        int ret;
+       struct btrfs_fs_info *fs_info = root->fs_info;
+
        BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
               root_objectid == BTRFS_TREE_LOG_OBJECTID);
 
        if (owner < BTRFS_FIRST_FREE_OBJECTID) {
-               ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes,
+               ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
+                                       num_bytes,
                                        parent, root_objectid, (int)owner,
-                                       BTRFS_ADD_DELAYED_REF, NULL);
+                                       BTRFS_ADD_DELAYED_REF, NULL, for_cow);
        } else {
-               ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes,
+               ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
+                                       num_bytes,
                                        parent, root_objectid, owner, offset,
-                                       BTRFS_ADD_DELAYED_REF, NULL);
+                                       BTRFS_ADD_DELAYED_REF, NULL, for_cow);
        }
        return ret;
 }
@@ -2232,6 +2235,28 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
                        }
                }
 
+               /*
+                * locked_ref is the head node, so we have to go one
+                * node back for any delayed ref updates
+                */
+               ref = select_delayed_ref(locked_ref);
+
+               if (ref && ref->seq &&
+                   btrfs_check_delayed_seq(delayed_refs, ref->seq)) {
+                       /*
+                        * there are still refs with lower seq numbers in the
+                        * process of being added. Don't run this ref yet.
+                        */
+                       list_del_init(&locked_ref->cluster);
+                       mutex_unlock(&locked_ref->mutex);
+                       locked_ref = NULL;
+                       delayed_refs->num_heads_ready++;
+                       spin_unlock(&delayed_refs->lock);
+                       cond_resched();
+                       spin_lock(&delayed_refs->lock);
+                       continue;
+               }
+
                /*
                 * record the must insert reserved flag before we
                 * drop the spin lock.
@@ -2242,11 +2267,6 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
                extent_op = locked_ref->extent_op;
                locked_ref->extent_op = NULL;
 
-               /*
-                * locked_ref is the head node, so we have to go one
-                * node back for any delayed ref updates
-                */
-               ref = select_delayed_ref(locked_ref);
                if (!ref) {
                        /* All delayed refs have been processed, Go ahead
                         * and send the head node to run_one_delayed_ref,
@@ -2267,9 +2287,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
                                BUG_ON(ret);
                                kfree(extent_op);
 
-                               cond_resched();
-                               spin_lock(&delayed_refs->lock);
-                               continue;
+                               goto next;
                        }
 
                        list_del_init(&locked_ref->cluster);
@@ -2279,7 +2297,12 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
                ref->in_tree = 0;
                rb_erase(&ref->rb_node, &delayed_refs->root);
                delayed_refs->num_entries--;
-
+               /*
+                * we modified num_entries, but as we're currently running
+                * delayed refs, skip
+                *     wake_up(&delayed_refs->seq_wait);
+                * here.
+                */
                spin_unlock(&delayed_refs->lock);
 
                ret = run_one_delayed_ref(trans, root, ref, extent_op,
@@ -2289,13 +2312,34 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
                btrfs_put_delayed_ref(ref);
                kfree(extent_op);
                count++;
-
+next:
+               do_chunk_alloc(trans, root->fs_info->extent_root,
+                              2 * 1024 * 1024,
+                              btrfs_get_alloc_profile(root, 0),
+                              CHUNK_ALLOC_NO_FORCE);
                cond_resched();
                spin_lock(&delayed_refs->lock);
        }
        return count;
 }
 
+
+static void wait_for_more_refs(struct btrfs_delayed_ref_root *delayed_refs,
+                       unsigned long num_refs)
+{
+       struct list_head *first_seq = delayed_refs->seq_head.next;
+
+       spin_unlock(&delayed_refs->lock);
+       pr_debug("waiting for more refs (num %ld, first %p)\n",
+                num_refs, first_seq);
+       wait_event(delayed_refs->seq_wait,
+                  num_refs != delayed_refs->num_entries ||
+                  delayed_refs->seq_head.next != first_seq);
+       pr_debug("done waiting for more refs (num %ld, first %p)\n",
+                delayed_refs->num_entries, delayed_refs->seq_head.next);
+       spin_lock(&delayed_refs->lock);
+}
+
 /*
  * this starts processing the delayed reference count updates and
  * extent insertions we have queued up so far.  count can be
@@ -2311,15 +2355,23 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
        struct btrfs_delayed_ref_node *ref;
        struct list_head cluster;
        int ret;
+       u64 delayed_start;
        int run_all = count == (unsigned long)-1;
        int run_most = 0;
+       unsigned long num_refs = 0;
+       int consider_waiting;
 
        if (root == root->fs_info->extent_root)
                root = root->fs_info->tree_root;
 
+       do_chunk_alloc(trans, root->fs_info->extent_root,
+                      2 * 1024 * 1024, btrfs_get_alloc_profile(root, 0),
+                      CHUNK_ALLOC_NO_FORCE);
+
        delayed_refs = &trans->transaction->delayed_refs;
        INIT_LIST_HEAD(&cluster);
 again:
+       consider_waiting = 0;
        spin_lock(&delayed_refs->lock);
        if (count == 0) {
                count = delayed_refs->num_entries * 2;
@@ -2336,11 +2388,35 @@ again:
                 * of refs to process starting at the first one we are able to
                 * lock
                 */
+               delayed_start = delayed_refs->run_delayed_start;
                ret = btrfs_find_ref_cluster(trans, &cluster,
                                             delayed_refs->run_delayed_start);
                if (ret)
                        break;
 
+               if (delayed_start >= delayed_refs->run_delayed_start) {
+                       if (consider_waiting == 0) {
+                               /*
+                                * btrfs_find_ref_cluster looped. let's do one
+                                * more cycle. if we don't run any delayed ref
+                                * during that cycle (because we can't because
+                                * all of them are blocked) and if the number of
+                                * refs doesn't change, we avoid busy waiting.
+                                */
+                               consider_waiting = 1;
+                               num_refs = delayed_refs->num_entries;
+                       } else {
+                               wait_for_more_refs(delayed_refs, num_refs);
+                               /*
+                                * after waiting, things have changed. we
+                                * dropped the lock and someone else might have
+                                * run some refs, built new clusters and so on.
+                                * therefore, we restart staleness detection.
+                                */
+                               consider_waiting = 0;
+                       }
+               }
+
                ret = run_clustered_refs(trans, root, &cluster);
                BUG_ON(ret < 0);
 
@@ -2348,6 +2424,11 @@ again:
 
                if (count == 0)
                        break;
+
+               if (ret || delayed_refs->run_delayed_start == 0) {
+                       /* refs were run, let's reset staleness detection */
+                       consider_waiting = 0;
+               }
        }
 
        if (run_all) {
@@ -2405,7 +2486,8 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
        extent_op->update_key = 0;
        extent_op->is_data = is_data ? 1 : 0;
 
-       ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, extent_op);
+       ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr,
+                                         num_bytes, extent_op);
        if (ret)
                kfree(extent_op);
        return ret;
@@ -2590,7 +2672,7 @@ out:
 static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
                           struct btrfs_root *root,
                           struct extent_buffer *buf,
-                          int full_backref, int inc)
+                          int full_backref, int inc, int for_cow)
 {
        u64 bytenr;
        u64 num_bytes;
@@ -2603,7 +2685,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
        int level;
        int ret = 0;
        int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
-                           u64, u64, u64, u64, u64, u64);
+                           u64, u64, u64, u64, u64, u64, int);
 
        ref_root = btrfs_header_owner(buf);
        nritems = btrfs_header_nritems(buf);
@@ -2640,14 +2722,15 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
                        key.offset -= btrfs_file_extent_offset(buf, fi);
                        ret = process_func(trans, root, bytenr, num_bytes,
                                           parent, ref_root, key.objectid,
-                                          key.offset);
+                                          key.offset, for_cow);
                        if (ret)
                                goto fail;
                } else {
                        bytenr = btrfs_node_blockptr(buf, i);
                        num_bytes = btrfs_level_size(root, level - 1);
                        ret = process_func(trans, root, bytenr, num_bytes,
-                                          parent, ref_root, level - 1, 0);
+                                          parent, ref_root, level - 1, 0,
+                                          for_cow);
                        if (ret)
                                goto fail;
                }
@@ -2659,15 +2742,15 @@ fail:
 }
 
 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-                 struct extent_buffer *buf, int full_backref)
+                 struct extent_buffer *buf, int full_backref, int for_cow)
 {
-       return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
+       return __btrfs_mod_ref(trans, root, buf, full_backref, 1, for_cow);
 }
 
 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-                 struct extent_buffer *buf, int full_backref)
+                 struct extent_buffer *buf, int full_backref, int for_cow)
 {
-       return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
+       return __btrfs_mod_ref(trans, root, buf, full_backref, 0, for_cow);
 }
 
 static int write_one_cache_group(struct btrfs_trans_handle *trans,
@@ -2993,9 +3076,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
                INIT_LIST_HEAD(&found->block_groups[i]);
        init_rwsem(&found->groups_sem);
        spin_lock_init(&found->lock);
-       found->flags = flags & (BTRFS_BLOCK_GROUP_DATA |
-                               BTRFS_BLOCK_GROUP_SYSTEM |
-                               BTRFS_BLOCK_GROUP_METADATA);
+       found->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
        found->total_bytes = total_bytes;
        found->disk_total = total_bytes * factor;
        found->bytes_used = bytes_used;
@@ -3016,20 +3097,27 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
 
 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
 {
-       u64 extra_flags = flags & (BTRFS_BLOCK_GROUP_RAID0 |
-                                  BTRFS_BLOCK_GROUP_RAID1 |
-                                  BTRFS_BLOCK_GROUP_RAID10 |
-                                  BTRFS_BLOCK_GROUP_DUP);
-       if (extra_flags) {
-               if (flags & BTRFS_BLOCK_GROUP_DATA)
-                       fs_info->avail_data_alloc_bits |= extra_flags;
-               if (flags & BTRFS_BLOCK_GROUP_METADATA)
-                       fs_info->avail_metadata_alloc_bits |= extra_flags;
-               if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
-                       fs_info->avail_system_alloc_bits |= extra_flags;
-       }
+       u64 extra_flags = flags & BTRFS_BLOCK_GROUP_PROFILE_MASK;
+
+       /* chunk -> extended profile */
+       if (extra_flags == 0)
+               extra_flags = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
+
+       if (flags & BTRFS_BLOCK_GROUP_DATA)
+               fs_info->avail_data_alloc_bits |= extra_flags;
+       if (flags & BTRFS_BLOCK_GROUP_METADATA)
+               fs_info->avail_metadata_alloc_bits |= extra_flags;
+       if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
+               fs_info->avail_system_alloc_bits |= extra_flags;
 }
 
+/*
+ * @flags: available profiles in extended format (see ctree.h)
+ *
+ * Returns reduced profile in chunk format.  If profile changing is in
+ * progress (either running or paused) picks the target profile (if it's
+ * already available), otherwise falls back to plain reducing.
+ */
 u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
 {
        /*
@@ -3040,6 +3128,34 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
        u64 num_devices = root->fs_info->fs_devices->rw_devices +
                root->fs_info->fs_devices->missing_devices;
 
+       /* pick restriper's target profile if it's available */
+       spin_lock(&root->fs_info->balance_lock);
+       if (root->fs_info->balance_ctl) {
+               struct btrfs_balance_control *bctl = root->fs_info->balance_ctl;
+               u64 tgt = 0;
+
+               if ((flags & BTRFS_BLOCK_GROUP_DATA) &&
+                   (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+                   (flags & bctl->data.target)) {
+                       tgt = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
+               } else if ((flags & BTRFS_BLOCK_GROUP_SYSTEM) &&
+                          (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+                          (flags & bctl->sys.target)) {
+                       tgt = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
+               } else if ((flags & BTRFS_BLOCK_GROUP_METADATA) &&
+                          (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+                          (flags & bctl->meta.target)) {
+                       tgt = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
+               }
+
+               if (tgt) {
+                       spin_unlock(&root->fs_info->balance_lock);
+                       flags = tgt;
+                       goto out;
+               }
+       }
+       spin_unlock(&root->fs_info->balance_lock);
+
        if (num_devices == 1)
                flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
        if (num_devices < 4)
@@ -3059,22 +3175,25 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
        if ((flags & BTRFS_BLOCK_GROUP_RAID0) &&
            ((flags & BTRFS_BLOCK_GROUP_RAID1) |
             (flags & BTRFS_BLOCK_GROUP_RAID10) |
-            (flags & BTRFS_BLOCK_GROUP_DUP)))
+            (flags & BTRFS_BLOCK_GROUP_DUP))) {
                flags &= ~BTRFS_BLOCK_GROUP_RAID0;
+       }
+
+out:
+       /* extended -> chunk profile */
+       flags &= ~BTRFS_AVAIL_ALLOC_BIT_SINGLE;
        return flags;
 }
 
 static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
 {
        if (flags & BTRFS_BLOCK_GROUP_DATA)
-               flags |= root->fs_info->avail_data_alloc_bits &
-                        root->fs_info->data_alloc_profile;
+               flags |= root->fs_info->avail_data_alloc_bits;
        else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
-               flags |= root->fs_info->avail_system_alloc_bits &
-                        root->fs_info->system_alloc_profile;
+               flags |= root->fs_info->avail_system_alloc_bits;
        else if (flags & BTRFS_BLOCK_GROUP_METADATA)
-               flags |= root->fs_info->avail_metadata_alloc_bits &
-                        root->fs_info->metadata_alloc_profile;
+               flags |= root->fs_info->avail_metadata_alloc_bits;
+
        return btrfs_reduce_alloc_profile(root, flags);
 }
 
@@ -3191,6 +3310,8 @@ commit_trans:
                return -ENOSPC;
        }
        data_sinfo->bytes_may_use += bytes;
+       trace_btrfs_space_reservation(root->fs_info, "space_info",
+                                     (u64)data_sinfo, bytes, 1);
        spin_unlock(&data_sinfo->lock);
 
        return 0;
@@ -3210,6 +3331,8 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
        data_sinfo = BTRFS_I(inode)->space_info;
        spin_lock(&data_sinfo->lock);
        data_sinfo->bytes_may_use -= bytes;
+       trace_btrfs_space_reservation(root->fs_info, "space_info",
+                                     (u64)data_sinfo, bytes, 0);
        spin_unlock(&data_sinfo->lock);
 }
 
@@ -3257,27 +3380,15 @@ static int should_alloc_chunk(struct btrfs_root *root,
                if (num_bytes - num_allocated < thresh)
                        return 1;
        }
-
-       /*
-        * we have two similar checks here, one based on percentage
-        * and once based on a hard number of 256MB.  The idea
-        * is that if we have a good amount of free
-        * room, don't allocate a chunk.  A good mount is
-        * less than 80% utilized of the chunks we have allocated,
-        * or more than 256MB free
-        */
-       if (num_allocated + alloc_bytes + 256 * 1024 * 1024 < num_bytes)
-               return 0;
-
-       if (num_allocated + alloc_bytes < div_factor(num_bytes, 8))
-               return 0;
-
        thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
 
-       /* 256MB or 5% of the FS */
-       thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5));
+       /* 256MB or 2% of the FS */
+       thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 2));
+       /* system chunks need a much small threshold */
+       if (sinfo->flags & BTRFS_BLOCK_GROUP_SYSTEM)
+               thresh = 32 * 1024 * 1024;
 
-       if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 3))
+       if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 8))
                return 0;
        return 1;
 }
@@ -3291,7 +3402,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
        int wait_for_alloc = 0;
        int ret = 0;
 
-       flags = btrfs_reduce_alloc_profile(extent_root, flags);
+       BUG_ON(!profile_is_valid(flags, 0));
 
        space_info = __find_space_info(extent_root->fs_info, flags);
        if (!space_info) {
@@ -3582,6 +3693,10 @@ again:
        if (used <= space_info->total_bytes) {
                if (used + orig_bytes <= space_info->total_bytes) {
                        space_info->bytes_may_use += orig_bytes;
+                       trace_btrfs_space_reservation(root->fs_info,
+                                                     "space_info",
+                                                     (u64)space_info,
+                                                     orig_bytes, 1);
                        ret = 0;
                } else {
                        /*
@@ -3649,6 +3764,10 @@ again:
 
                if (used + num_bytes < space_info->total_bytes + avail) {
                        space_info->bytes_may_use += orig_bytes;
+                       trace_btrfs_space_reservation(root->fs_info,
+                                                     "space_info",
+                                                     (u64)space_info,
+                                            &