1 /*
2  * Copyright (C) 2007 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 021110-1307, USA.
17  */
18 #include <linux/sched.h>
19 #include <linux/pagemap.h>
20 #include <linux/writeback.h>
21 #include <linux/blkdev.h>
22 #include <linux/sort.h>
23 #include <linux/rcupdate.h>
24 #include <linux/kthread.h>
25 #include <linux/slab.h>
26 #include <linux/ratelimit.h>
27 #include "compat.h"
28 #include "hash.h"
29 #include "ctree.h"
30 #include "disk-io.h"
31 #include "print-tree.h"
32 #include "transaction.h"
33 #include "volumes.h"
34 #include "raid56.h"
35 #include "locking.h"
36 #include "free-space-cache.h"
37 #include "math.h"
38
39 #undef SCRAMBLE_DELAYED_REFS
40
41 /*
42  * control flags for do_chunk_alloc's force field:
43  * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk
44  * if we really need one.
45  *
46  * CHUNK_ALLOC_LIMITED means to only try to allocate one
47  * if we have very few chunks already allocated.  This is
48  * used as part of the clustering code to help make sure
49  * we have a good pool of storage to cluster in, without
50  * filling the FS with empty chunks.
51  *
52  * CHUNK_ALLOC_FORCE means it must try to allocate one.
53  *
54  */
55 enum {
56         CHUNK_ALLOC_NO_FORCE = 0,
57         CHUNK_ALLOC_LIMITED = 1,
58         CHUNK_ALLOC_FORCE = 2,
59 };
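/*
 * Illustrative sketch, not part of the original file: these values feed the
 * "force" argument of do_chunk_alloc() (declared below).  Hypothetical
 * callers would look roughly like this; real call sites differ:
 *
 *	ret = do_chunk_alloc(trans, extent_root, flags, CHUNK_ALLOC_NO_FORCE);
 *		-> allocate only if a new chunk is really needed
 *	ret = do_chunk_alloc(trans, extent_root, flags, CHUNK_ALLOC_LIMITED);
 *		-> allocate only if few chunks exist, to give clustering room
 *	ret = do_chunk_alloc(trans, extent_root, flags, CHUNK_ALLOC_FORCE);
 *		-> always attempt the allocation
 */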
60
61 /*
62  * Control how reservations are dealt with.
63  *
64  * RESERVE_FREE - freeing a reservation.
65  * RESERVE_ALLOC - allocating space and we need to update bytes_may_use for
66  *   ENOSPC accounting
67  * RESERVE_ALLOC_NO_ACCOUNT - allocating space and we should not update
68  *   bytes_may_use as the ENOSPC accounting is done elsewhere
69  */
70 enum {
71         RESERVE_FREE = 0,
72         RESERVE_ALLOC = 1,
73         RESERVE_ALLOC_NO_ACCOUNT = 2,
74 };
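/*
 * Illustrative sketch, not part of the original file: these values are passed
 * as the "reserve" argument of btrfs_update_reserved_bytes() (declared below).
 * A hypothetical reserve/release pair would look roughly like:
 *
 *	btrfs_update_reserved_bytes(cache, num_bytes, RESERVE_ALLOC);
 *	... allocation proceeds, or fails and the space is given back ...
 *	btrfs_update_reserved_bytes(cache, num_bytes, RESERVE_FREE);
 *
 * RESERVE_ALLOC_NO_ACCOUNT behaves like RESERVE_ALLOC except that
 * bytes_may_use is not touched, because the ENOSPC accounting for that
 * reservation is handled elsewhere.
 */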
75
76 static int update_block_group(struct btrfs_root *root,
77                               u64 bytenr, u64 num_bytes, int alloc);
78 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
79                                 struct btrfs_root *root,
80                                 u64 bytenr, u64 num_bytes, u64 parent,
81                                 u64 root_objectid, u64 owner_objectid,
82                                 u64 owner_offset, int refs_to_drop,
83                                 struct btrfs_delayed_extent_op *extra_op);
84 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
85                                     struct extent_buffer *leaf,
86                                     struct btrfs_extent_item *ei);
87 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
88                                       struct btrfs_root *root,
89                                       u64 parent, u64 root_objectid,
90                                       u64 flags, u64 owner, u64 offset,
91                                       struct btrfs_key *ins, int ref_mod);
92 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
93                                      struct btrfs_root *root,
94                                      u64 parent, u64 root_objectid,
95                                      u64 flags, struct btrfs_disk_key *key,
96                                      int level, struct btrfs_key *ins);
97 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
98                           struct btrfs_root *extent_root, u64 flags,
99                           int force);
100 static int find_next_key(struct btrfs_path *path, int level,
101                          struct btrfs_key *key);
102 static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
103                             int dump_block_groups);
104 static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
105                                        u64 num_bytes, int reserve);
106 static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
107                                u64 num_bytes);
108
109 static noinline int
110 block_group_cache_done(struct btrfs_block_group_cache *cache)
111 {
112         smp_mb();
113         return cache->cached == BTRFS_CACHE_FINISHED;
114 }
115
116 static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
117 {
118         return (cache->flags & bits) == bits;
119 }
120
121 static void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
122 {
123         atomic_inc(&cache->count);
124 }
125
126 void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
127 {
128         if (atomic_dec_and_test(&cache->count)) {
129                 WARN_ON(cache->pinned > 0);
130                 WARN_ON(cache->reserved > 0);
131                 kfree(cache->free_space_ctl);
132                 kfree(cache);
133         }
134 }
135
136 /*
137  * this adds the block group to the fs_info rb tree for the block group
138  * cache
139  */
140 static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
141                                 struct btrfs_block_group_cache *block_group)
142 {
143         struct rb_node **p;
144         struct rb_node *parent = NULL;
145         struct btrfs_block_group_cache *cache;
146
147         spin_lock(&info->block_group_cache_lock);
148         p = &info->block_group_cache_tree.rb_node;
149
150         while (*p) {
151                 parent = *p;
152                 cache = rb_entry(parent, struct btrfs_block_group_cache,
153                                  cache_node);
154                 if (block_group->key.objectid < cache->key.objectid) {
155                         p = &(*p)->rb_left;
156                 } else if (block_group->key.objectid > cache->key.objectid) {
157                         p = &(*p)->rb_right;
158                 } else {
159                         spin_unlock(&info->block_group_cache_lock);
160                         return -EEXIST;
161                 }
162         }
163
164         rb_link_node(&block_group->cache_node, parent, p);
165         rb_insert_color(&block_group->cache_node,
166                         &info->block_group_cache_tree);
167
168         if (info->first_logical_byte > block_group->key.objectid)
169                 info->first_logical_byte = block_group->key.objectid;
170
171         spin_unlock(&info->block_group_cache_lock);
172
173         return 0;
174 }
175
176 /*
177  * This will return the block group at or after bytenr if contains is 0, else
178  * it will return the block group that contains the bytenr.
179  */
180 static struct btrfs_block_group_cache *
181 block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
182                               int contains)
183 {
184         struct btrfs_block_group_cache *cache, *ret = NULL;
185         struct rb_node *n;
186         u64 end, start;
187
188         spin_lock(&info->block_group_cache_lock);
189         n = info->block_group_cache_tree.rb_node;
190
191         while (n) {
192                 cache = rb_entry(n, struct btrfs_block_group_cache,
193                                  cache_node);
194                 end = cache->key.objectid + cache->key.offset - 1;
195                 start = cache->key.objectid;
196
197                 if (bytenr < start) {
198                         if (!contains && (!ret || start < ret->key.objectid))
199                                 ret = cache;
200                         n = n->rb_left;
201                 } else if (bytenr > start) {
202                         if (contains && bytenr <= end) {
203                                 ret = cache;
204                                 break;
205                         }
206                         n = n->rb_right;
207                 } else {
208                         ret = cache;
209                         break;
210                 }
211         }
212         if (ret) {
213                 btrfs_get_block_group(ret);
214                 if (bytenr == 0 && info->first_logical_byte > ret->key.objectid)
215                         info->first_logical_byte = ret->key.objectid;
216         }
217         spin_unlock(&info->block_group_cache_lock);
218
219         return ret;
220 }
221
222 static int add_excluded_extent(struct btrfs_root *root,
223                                u64 start, u64 num_bytes)
224 {
225         u64 end = start + num_bytes - 1;
226         set_extent_bits(&root->fs_info->freed_extents[0],
227                         start, end, EXTENT_UPTODATE, GFP_NOFS);
228         set_extent_bits(&root->fs_info->freed_extents[1],
229                         start, end, EXTENT_UPTODATE, GFP_NOFS);
230         return 0;
231 }
232
233 static void free_excluded_extents(struct btrfs_root *root,
234                                   struct btrfs_block_group_cache *cache)
235 {
236         u64 start, end;
237
238         start = cache->key.objectid;
239         end = start + cache->key.offset - 1;
240
241         clear_extent_bits(&root->fs_info->freed_extents[0],
242                           start, end, EXTENT_UPTODATE, GFP_NOFS);
243         clear_extent_bits(&root->fs_info->freed_extents[1],
244                           start, end, EXTENT_UPTODATE, GFP_NOFS);
245 }
246
247 static int exclude_super_stripes(struct btrfs_root *root,
248                                  struct btrfs_block_group_cache *cache)
249 {
250         u64 bytenr;
251         u64 *logical;
252         int stripe_len;
253         int i, nr, ret;
254
255         if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) {
256                 stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid;
257                 cache->bytes_super += stripe_len;
258                 ret = add_excluded_extent(root, cache->key.objectid,
259                                           stripe_len);
260                 BUG_ON(ret); /* -ENOMEM */
261         }
262
263         for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
264                 bytenr = btrfs_sb_offset(i);
265                 ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
266                                        cache->key.objectid, bytenr,
267                                        0, &logical, &nr, &stripe_len);
268                 BUG_ON(ret); /* -ENOMEM */
269
270                 while (nr--) {
271                         cache->bytes_super += stripe_len;
272                         ret = add_excluded_extent(root, logical[nr],
273                                                   stripe_len);
274                         BUG_ON(ret); /* -ENOMEM */
275                 }
276
277                 kfree(logical);
278         }
279         return 0;
280 }
281
282 static struct btrfs_caching_control *
283 get_caching_control(struct btrfs_block_group_cache *cache)
284 {
285         struct btrfs_caching_control *ctl;
286
287         spin_lock(&cache->lock);
288         if (cache->cached != BTRFS_CACHE_STARTED) {
289                 spin_unlock(&cache->lock);
290                 return NULL;
291         }
292
293         /* We're loading it the fast way, so we don't have a caching_ctl. */
294         if (!cache->caching_ctl) {
295                 spin_unlock(&cache->lock);
296                 return NULL;
297         }
298
299         ctl = cache->caching_ctl;
300         atomic_inc(&ctl->count);
301         spin_unlock(&cache->lock);
302         return ctl;
303 }
304
305 static void put_caching_control(struct btrfs_caching_control *ctl)
306 {
307         if (atomic_dec_and_test(&ctl->count))
308                 kfree(ctl);
309 }
310
311 /*
312  * this is only called by cache_block_group; since we could have freed extents,
313  * we need to check the pinned_extents for any extents that can't be used yet,
314  * since their free space will be released as soon as the transaction commits.
315  */
316 static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
317                               struct btrfs_fs_info *info, u64 start, u64 end)
318 {
319         u64 extent_start, extent_end, size, total_added = 0;
320         int ret;
321
322         while (start < end) {
323                 ret = find_first_extent_bit(info->pinned_extents, start,
324                                             &extent_start, &extent_end,
325                                             EXTENT_DIRTY | EXTENT_UPTODATE,
326                                             NULL);
327                 if (ret)
328                         break;
329
330                 if (extent_start <= start) {
331                         start = extent_end + 1;
332                 } else if (extent_start > start && extent_start < end) {
333                         size = extent_start - start;
334                         total_added += size;
335                         ret = btrfs_add_free_space(block_group, start,
336                                                    size);
337                         BUG_ON(ret); /* -ENOMEM or logic error */
338                         start = extent_end + 1;
339                 } else {
340                         break;
341                 }
342         }
343
344         if (start < end) {
345                 size = end - start;
346                 total_added += size;
347                 ret = btrfs_add_free_space(block_group, start, size);
348                 BUG_ON(ret); /* -ENOMEM or logic error */
349         }
350
351         return total_added;
352 }
353
354 static noinline void caching_thread(struct btrfs_work *work)
355 {
356         struct btrfs_block_group_cache *block_group;
357         struct btrfs_fs_info *fs_info;
358         struct btrfs_caching_control *caching_ctl;
359         struct btrfs_root *extent_root;
360         struct btrfs_path *path;
361         struct extent_buffer *leaf;
362         struct btrfs_key key;
363         u64 total_found = 0;
364         u64 last = 0;
365         u32 nritems;
366         int ret = 0;
367
368         caching_ctl = container_of(work, struct btrfs_caching_control, work);
369         block_group = caching_ctl->block_group;
370         fs_info = block_group->fs_info;
371         extent_root = fs_info->extent_root;
372
373         path = btrfs_alloc_path();
374         if (!path)
375                 goto out;
376
377         last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
378
379         /*
380          * We don't want to deadlock with somebody trying to allocate a new
381          * extent for the extent root while also trying to search the extent
382          * root to add free space.  So we skip locking and search the commit
383          * root, since it's read-only.
384          */
385         path->skip_locking = 1;
386         path->search_commit_root = 1;
387         path->reada = 1;
388
389         key.objectid = last;
390         key.offset = 0;
391         key.type = BTRFS_EXTENT_ITEM_KEY;
392 again:
393         mutex_lock(&caching_ctl->mutex);
394         /* need to make sure the commit_root doesn't disappear */
395         down_read(&fs_info->extent_commit_sem);
396
397         ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
398         if (ret < 0)
399                 goto err;
400
401         leaf = path->nodes[0];
402         nritems = btrfs_header_nritems(leaf);
403
404         while (1) {
405                 if (btrfs_fs_closing(fs_info) > 1) {
406                         last = (u64)-1;
407                         break;
408                 }
409
410                 if (path->slots[0] < nritems) {
411                         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
412                 } else {
413                         ret = find_next_key(path, 0, &key);
414                         if (ret)
415                                 break;
416
417                         if (need_resched() ||
418                             btrfs_next_leaf(extent_root, path)) {
419                                 caching_ctl->progress = last;
420                                 btrfs_release_path(path);
421                                 up_read(&fs_info->extent_commit_sem);
422                                 mutex_unlock(&caching_ctl->mutex);
423                                 cond_resched();
424                                 goto again;
425                         }
426                         leaf = path->nodes[0];
427                         nritems = btrfs_header_nritems(leaf);
428                         continue;
429                 }
430
431                 if (key.objectid < block_group->key.objectid) {
432                         path->slots[0]++;
433                         continue;
434                 }
435
436                 if (key.objectid >= block_group->key.objectid +
437                     block_group->key.offset)
438                         break;
439
440                 if (key.type == BTRFS_EXTENT_ITEM_KEY) {
441                         total_found += add_new_free_space(block_group,
442                                                           fs_info, last,
443                                                           key.objectid);
444                         last = key.objectid + key.offset;
445
446                         if (total_found > (1024 * 1024 * 2)) {
447                                 total_found = 0;
448                                 wake_up(&caching_ctl->wait);
449                         }
450                 }
451                 path->slots[0]++;
452         }
453         ret = 0;
454
455         total_found += add_new_free_space(block_group, fs_info, last,
456                                           block_group->key.objectid +
457                                           block_group->key.offset);
458         caching_ctl->progress = (u64)-1;
459
460         spin_lock(&block_group->lock);
461         block_group->caching_ctl = NULL;
462         block_group->cached = BTRFS_CACHE_FINISHED;
463         spin_unlock(&block_group->lock);
464
465 err:
466         btrfs_free_path(path);
467         up_read(&fs_info->extent_commit_sem);
468
469         free_excluded_extents(extent_root, block_group);
470
471         mutex_unlock(&caching_ctl->mutex);
472 out:
473         wake_up(&caching_ctl->wait);
474
475         put_caching_control(caching_ctl);
476         btrfs_put_block_group(block_group);
477 }
478
479 static int cache_block_group(struct btrfs_block_group_cache *cache,
480                              int load_cache_only)
481 {
482         DEFINE_WAIT(wait);
483         struct btrfs_fs_info *fs_info = cache->fs_info;
484         struct btrfs_caching_control *caching_ctl;
485         int ret = 0;
486
487         caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
488         if (!caching_ctl)
489                 return -ENOMEM;
490
491         INIT_LIST_HEAD(&caching_ctl->list);
492         mutex_init(&caching_ctl->mutex);
493         init_waitqueue_head(&caching_ctl->wait);
494         caching_ctl->block_group = cache;
495         caching_ctl->progress = cache->key.objectid;
496         atomic_set(&caching_ctl->count, 1);
497         caching_ctl->work.func = caching_thread;
498
499         spin_lock(&cache->lock);
500         /*
501          * This should be a rare occasion, but it could happen in the
502          * case where one thread starts to load the space cache info, and then
503          * some other thread starts a transaction commit which tries to do an
504          * allocation while the other thread is still loading the space cache
505          * info.  The previous loop should have kept us from choosing this block
506          * group, but if we've moved to the state where we will wait on caching
507          * block groups we need to first check if we're doing a fast load here,
508          * so we can wait for it to finish, otherwise we could end up allocating
509          * from a block group whose cache gets evicted for one reason or
510          * another.
511          */
512         while (cache->cached == BTRFS_CACHE_FAST) {
513                 struct btrfs_caching_control *ctl;
514
515                 ctl = cache->caching_ctl;
516                 atomic_inc(&ctl->count);
517                 prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
518                 spin_unlock(&cache->lock);
519
520                 schedule();
521
522                 finish_wait(&ctl->wait, &wait);
523                 put_caching_control(ctl);
524                 spin_lock(&cache->lock);
525         }
526
527         if (cache->cached != BTRFS_CACHE_NO) {
528                 spin_unlock(&cache->lock);
529                 kfree(caching_ctl);
530                 return 0;
531         }
532         WARN_ON(cache->caching_ctl);
533         cache->caching_ctl = caching_ctl;
534         cache->cached = BTRFS_CACHE_FAST;
535         spin_unlock(&cache->lock);
536
537         if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) {
538                 ret = load_free_space_cache(fs_info, cache);
539
540                 spin_lock(&cache->lock);
541                 if (ret == 1) {
542                         cache->caching_ctl = NULL;
543                         cache->cached = BTRFS_CACHE_FINISHED;
544                         cache->last_byte_to_unpin = (u64)-1;
545                 } else {
546                         if (load_cache_only) {
547                                 cache->caching_ctl = NULL;
548                                 cache->cached = BTRFS_CACHE_NO;
549                         } else {
550                                 cache->cached = BTRFS_CACHE_STARTED;
551                         }
552                 }
553                 spin_unlock(&cache->lock);
554                 wake_up(&caching_ctl->wait);
555                 if (ret == 1) {
556                         put_caching_control(caching_ctl);
557                         free_excluded_extents(fs_info->extent_root, cache);
558                         return 0;
559                 }
560         } else {
561                 /*
562                  * We are not going to do the fast caching, so set cached to the
563                  * appropriate value and wake up any waiters.
564                  */
565                 spin_lock(&cache->lock);
566                 if (load_cache_only) {
567                         cache->caching_ctl = NULL;
568                         cache->cached = BTRFS_CACHE_NO;
569                 } else {
570                         cache->cached = BTRFS_CACHE_STARTED;
571                 }
572                 spin_unlock(&cache->lock);
573                 wake_up(&caching_ctl->wait);
574         }
575
576         if (load_cache_only) {
577                 put_caching_control(caching_ctl);
578                 return 0;
579         }
580
581         down_write(&fs_info->extent_commit_sem);
582         atomic_inc(&caching_ctl->count);
583         list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
584         up_write(&fs_info->extent_commit_sem);
585
586         btrfs_get_block_group(cache);
587
588         btrfs_queue_worker(&fs_info->caching_workers, &caching_ctl->work);
589
590         return ret;
591 }
592
593 /*
594  * return the block group that starts at or after bytenr
595  */
596 static struct btrfs_block_group_cache *
597 btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
598 {
599         struct btrfs_block_group_cache *cache;
600
601         cache = block_group_cache_tree_search(info, bytenr, 0);
602
603         return cache;
604 }
605
606 /*
607  * return the block group that contains the given bytenr
608  */
609 struct btrfs_block_group_cache *btrfs_lookup_block_group(
610                                                  struct btrfs_fs_info *info,
611                                                  u64 bytenr)
612 {
613         struct btrfs_block_group_cache *cache;
614
615         cache = block_group_cache_tree_search(info, bytenr, 1);
616
617         return cache;
618 }
619
620 static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
621                                                   u64 flags)
622 {
623         struct list_head *head = &info->space_info;
624         struct btrfs_space_info *found;
625
626         flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;
627
628         rcu_read_lock();
629         list_for_each_entry_rcu(found, head, list) {
630                 if (found->flags & flags) {
631                         rcu_read_unlock();
632                         return found;
633                 }
634         }
635         rcu_read_unlock();
636         return NULL;
637 }
638
639 /*
640  * after adding space to the filesystem, we need to clear the full flags
641  * on all the space infos.
642  */
643 void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
644 {
645         struct list_head *head = &info->space_info;
646         struct btrfs_space_info *found;
647
648         rcu_read_lock();
649         list_for_each_entry_rcu(found, head, list)
650                 found->full = 0;
651         rcu_read_unlock();
652 }
653
654 u64 btrfs_find_block_group(struct btrfs_root *root,
655                            u64 search_start, u64 search_hint, int owner)
656 {
657         struct btrfs_block_group_cache *cache;
658         u64 used;
659         u64 last = max(search_hint, search_start);
660         u64 group_start = 0;
661         int full_search = 0;
662         int factor = 9;
663         int wrapped = 0;
664 again:
665         while (1) {
666                 cache = btrfs_lookup_first_block_group(root->fs_info, last);
667                 if (!cache)
668                         break;
669
670                 spin_lock(&cache->lock);
671                 last = cache->key.objectid + cache->key.offset;
672                 used = btrfs_block_group_used(&cache->item);
673
674                 if ((full_search || !cache->ro) &&
675                     block_group_bits(cache, BTRFS_BLOCK_GROUP_METADATA)) {
676                         if (used + cache->pinned + cache->reserved <
677                             div_factor(cache->key.offset, factor)) {
678                                 group_start = cache->key.objectid;
679                                 spin_unlock(&cache->lock);
680                                 btrfs_put_block_group(cache);
681                                 goto found;
682                         }
683                 }
684                 spin_unlock(&cache->lock);
685                 btrfs_put_block_group(cache);
686                 cond_resched();
687         }
688         if (!wrapped) {
689                 last = search_start;
690                 wrapped = 1;
691                 goto again;
692         }
693         if (!full_search && factor < 10) {
694                 last = search_start;
695                 full_search = 1;
696                 factor = 10;
697                 goto again;
698         }
699 found:
700         return group_start;
701 }
702
703 /* simple helper to search for an existing extent at a given offset */
704 int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
705 {
706         int ret;
707         struct btrfs_key key;
708         struct btrfs_path *path;
709
710         path = btrfs_alloc_path();
711         if (!path)
712                 return -ENOMEM;
713
714         key.objectid = start;
715         key.offset = len;
716         btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
717         ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path,
718                                 0, 0);
719         btrfs_free_path(path);
720         return ret;
721 }
722
723 /*
724  * helper function to look up the reference count and flags of an extent.
725  *
726  * the head node for a delayed ref is used to store the sum of all the
727  * reference count modifications queued up in the rbtree. the head
728  * node may also store the extent flags to set. This way you can check
729  * to see what the reference count and extent flags would be if all of
730  * the delayed refs were processed.
731  */
732 int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
733                              struct btrfs_root *root, u64 bytenr,
734                              u64 num_bytes, u64 *refs, u64 *flags)
735 {
736         struct btrfs_delayed_ref_head *head;
737         struct btrfs_delayed_ref_root *delayed_refs;
738         struct btrfs_path *path;
739         struct btrfs_extent_item *ei;
740         struct extent_buffer *leaf;
741         struct btrfs_key key;
742         u32 item_size;
743         u64 num_refs;
744         u64 extent_flags;
745         int ret;
746
747         path = btrfs_alloc_path();
748         if (!path)
749                 return -ENOMEM;
750
751         key.objectid = bytenr;
752         key.type = BTRFS_EXTENT_ITEM_KEY;
753         key.offset = num_bytes;
754         if (!trans) {
755                 path->skip_locking = 1;
756                 path->search_commit_root = 1;
757         }
758 again:
759         ret = btrfs_search_slot(trans, root->fs_info->extent_root,
760                                 &key, path, 0, 0);
761         if (ret < 0)
762                 goto out_free;
763
764         if (ret == 0) {
765                 leaf = path->nodes[0];
766                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
767                 if (item_size >= sizeof(*ei)) {
768                         ei = btrfs_item_ptr(leaf, path->slots[0],
769                                             struct btrfs_extent_item);
770                         num_refs = btrfs_extent_refs(leaf, ei);
771                         extent_flags = btrfs_extent_flags(leaf, ei);
772                 } else {
773 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
774                         struct btrfs_extent_item_v0 *ei0;
775                         BUG_ON(item_size != sizeof(*ei0));
776                         ei0 = btrfs_item_ptr(leaf, path->slots[0],
777                                              struct btrfs_extent_item_v0);
778                         num_refs = btrfs_extent_refs_v0(leaf, ei0);
779                         /* FIXME: this isn't correct for data */
780                         extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
781 #else
782                         BUG();
783 #endif
784                 }
785                 BUG_ON(num_refs == 0);
786         } else {
787                 num_refs = 0;
788                 extent_flags = 0;
789                 ret = 0;
790         }
791
792         if (!trans)
793                 goto out;
794
795         delayed_refs = &trans->transaction->delayed_refs;
796         spin_lock(&delayed_refs->lock);
797         head = btrfs_find_delayed_ref_head(trans, bytenr);
798         if (head) {
799                 if (!mutex_trylock(&head->mutex)) {
800                         atomic_inc(&head->node.refs);
801                         spin_unlock(&delayed_refs->lock);
802
803                         btrfs_release_path(path);
804
805                         /*
806                          * Mutex was contended, block until it's released and try
807                          * again
808                          */
809                         mutex_lock(&head->mutex);
810                         mutex_unlock(&head->mutex);
811                         btrfs_put_delayed_ref(&head->node);
812                         goto again;
813                 }
814                 if (head->extent_op && head->extent_op->update_flags)
815                         extent_flags |= head->extent_op->flags_to_set;
816                 else
817                         BUG_ON(num_refs == 0);
818
819                 num_refs += head->node.ref_mod;
820                 mutex_unlock(&head->mutex);
821         }
822         spin_unlock(&delayed_refs->lock);
823 out:
824         WARN_ON(num_refs == 0);
825         if (refs)
826                 *refs = num_refs;
827         if (flags)
828                 *flags = extent_flags;
829 out_free:
830         btrfs_free_path(path);
831         return ret;
832 }
833
834 /*
835  * Back reference rules.  Back refs have three main goals:
836  *
837  * 1) Differentiate between all holders of references to an extent so that
838  *    when a reference is dropped we can make sure it was a valid reference
839  *    before freeing the extent.
840  *
841  * 2) Provide enough information to quickly find the holders of an extent
842  *    if we notice a given block is corrupted or bad.
843  *
844  * 3) Make it easy to migrate blocks for FS shrinking or storage pool
845  *    maintenance.  This is actually the same as #2, but with a slightly
846  *    different use case.
847  *
848  * There are two kinds of back refs. The implicit back refs is optimized
849  * for pointers in non-shared tree blocks. For a given pointer in a block,
850  * back refs of this kind provide information about the block's owner tree
851  * and the pointer's key. This information allows us to find the block by
852  * b-tree searching. The full back refs is for pointers in tree blocks not
853  * referenced by their owner trees. The location of the tree block is recorded
854  * in the back refs. Actually the full back refs is generic, and can be
855  * used in all cases where the implicit back refs is used. The major shortcoming
856  * of the full back refs is its overhead. Every time a tree block gets
857  * COWed, we have to update back refs entry for all pointers in it.
858  *
859  * For a newly allocated tree block, we use implicit back refs for
860  * pointers in it. This means most tree related operations only involve
861  * implicit back refs. For a tree block created in an old transaction, the
862  * only way to drop a reference to it is to COW it. So we can detect the
863  * event that a tree block loses its owner tree's reference and do the
864  * back refs conversion.
865  *
866  * When a tree block is COW'd through a tree, there are four cases:
867  *
868  * The reference count of the block is one and the tree is the block's
869  * owner tree. Nothing to do in this case.
870  *
871  * The reference count of the block is one and the tree is not the
872  * block's owner tree. In this case, full back refs is used for pointers
873  * in the block. Remove these full back refs, add implicit back refs for
874  * every pointer in the new block.
875  *
876  * The reference count of the block is greater than one and the tree is
877  * the block's owner tree. In this case, implicit back refs is used for
878  * pointers in the block. Add full back refs for every pointer in the
879  * block, increase lower level extents' reference counts. The original
880  * implicit back refs are carried over to the new block.
881  *
882  * The reference count of the block is greater than one and the tree is
883  * not the block's owner tree. Add implicit back refs for every pointer in
884  * the new block, increase lower level extents' reference count.
885  *
886  * Back Reference Key composing:
887  *
888  * The key objectid corresponds to the first byte in the extent.
889  * The key type is used to differentiate between types of back refs.
890  * There are different meanings of the key offset for different types
891  * of back refs.
892  *
893  * File extents can be referenced by:
894  *
895  * - multiple snapshots, subvolumes, or different generations in one subvol
896  * - different files inside a single subvolume
897  * - different offsets inside a file (bookend extents in file.c)
898  *
899  * The extent ref structure for the implicit back refs has fields for:
900  *
901  * - Objectid of the subvolume root
902  * - objectid of the file holding the reference
903  * - original offset in the file
904  * - how many bookend extents
905  *
906  * The key offset for the implicit back refs is a hash of the first
907  * three fields.
908  *
909  * The extent ref structure for the full back refs has a field for:
910  *
911  * - number of pointers in the tree leaf
912  *
913  * The key offset for the full back refs is the first byte of
914  * the tree leaf.
915  *
916  * When a file extent is allocated, the implicit back refs is used.
917  * The fields are filled in:
918  *
919  *     (root_key.objectid, inode objectid, offset in file, 1)
920  *
921  * When a file extent is removed during file truncation, we find the
922  * corresponding implicit back refs and check the following fields:
923  *
924  *     (btrfs_header_owner(leaf), inode objectid, offset in file)
925  *
926  * Btree extents can be referenced by:
927  *
928  * - Different subvolumes
929  *
930  * Both the implicit back refs and the full back refs for tree blocks
931  * only consist of a key. The key offset for the implicit back refs is
932  * the objectid of the block's owner tree. The key offset for the full back
933  * refs is the first byte of the parent block.
934  *
935  * When implicit back refs is used, information about the lowest key and
936  * level of the tree block is required. This information is stored in
937  * the tree block info structure.
938  */
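/*
 * Illustrative sketch, not part of the original file: composing the key of an
 * implicit data back ref, using hypothetical values.  An extent at bytenr X
 * referenced by subvolume root 5, inode 257, file offset 0 would carry a back
 * ref item keyed roughly as:
 *
 *	key.objectid = X;
 *	key.type = BTRFS_EXTENT_DATA_REF_KEY;
 *	key.offset = hash_extent_data_ref(5, 257, 0);
 *
 * The full back ref for the same data extent would instead use
 * BTRFS_SHARED_DATA_REF_KEY with key.offset set to the bytenr of the tree
 * leaf that holds the file extent pointer.  hash_extent_data_ref() is
 * defined later in this file.
 */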
939
940 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
941 static int convert_extent_item_v0(struct btrfs_trans_handle *trans,
942                                   struct btrfs_root *root,
943                                   struct btrfs_path *path,
944                                   u64 owner, u32 extra_size)
945 {
946         struct btrfs_extent_item *item;
947         struct btrfs_extent_item_v0 *ei0;
948         struct btrfs_extent_ref_v0 *ref0;
949         struct btrfs_tree_block_info *bi;
950         struct extent_buffer *leaf;
951         struct btrfs_key key;
952         struct btrfs_key found_key;
953         u32 new_size = sizeof(*item);
954         u64 refs;
955         int ret;
956
957         leaf = path->nodes[0];
958         BUG_ON(btrfs_item_size_nr(leaf, path->slots[0]) != sizeof(*ei0));
959
960         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
961         ei0 = btrfs_item_ptr(leaf, path->slots[0],
962                              struct btrfs_extent_item_v0);
963         refs = btrfs_extent_refs_v0(leaf, ei0);
964
965         if (owner == (u64)-1) {
966                 while (1) {
967                         if (path->slots[0] >= btrfs_header_nritems(leaf)) {
968                                 ret = btrfs_next_leaf(root, path);
969                                 if (ret < 0)
970                                         return ret;
971                                 BUG_ON(ret > 0); /* Corruption */
972                                 leaf = path->nodes[0];
973                         }
974                         btrfs_item_key_to_cpu(leaf, &found_key,
975                                               path->slots[0]);
976                         BUG_ON(key.objectid != found_key.objectid);
977                         if (found_key.type != BTRFS_EXTENT_REF_V0_KEY) {
978                                 path->slots[0]++;
979                                 continue;
980                         }
981                         ref0 = btrfs_item_ptr(leaf, path->slots[0],
982                                               struct btrfs_extent_ref_v0);
983                         owner = btrfs_ref_objectid_v0(leaf, ref0);
984                         break;
985                 }
986         }
987         btrfs_release_path(path);
988
989         if (owner < BTRFS_FIRST_FREE_OBJECTID)
990                 new_size += sizeof(*bi);
991
992         new_size -= sizeof(*ei0);
993         ret = btrfs_search_slot(trans, root, &key, path,
994                                 new_size + extra_size, 1);
995         if (ret < 0)
996                 return ret;
997         BUG_ON(ret); /* Corruption */
998
999         btrfs_extend_item(trans, root, path, new_size);
1000
1001         leaf = path->nodes[0];
1002         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1003         btrfs_set_extent_refs(leaf, item, refs);
1004         /* FIXME: get real generation */
1005         btrfs_set_extent_generation(leaf, item, 0);
1006         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1007                 btrfs_set_extent_flags(leaf, item,
1008                                        BTRFS_EXTENT_FLAG_TREE_BLOCK |
1009                                        BTRFS_BLOCK_FLAG_FULL_BACKREF);
1010                 bi = (struct btrfs_tree_block_info *)(item + 1);
1011                 /* FIXME: get first key of the block */
1012                 memset_extent_buffer(leaf, 0, (unsigned long)bi, sizeof(*bi));
1013                 btrfs_set_tree_block_level(leaf, bi, (int)owner);
1014         } else {
1015                 btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_DATA);
1016         }
1017         btrfs_mark_buffer_dirty(leaf);
1018         return 0;
1019 }
1020 #endif
1021
1022 static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
1023 {
1024         u32 high_crc = ~(u32)0;
1025         u32 low_crc = ~(u32)0;
1026         __le64 lenum;
1027
1028         lenum = cpu_to_le64(root_objectid);
1029         high_crc = crc32c(high_crc, &lenum, sizeof(lenum));
1030         lenum = cpu_to_le64(owner);
1031         low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
1032         lenum = cpu_to_le64(offset);
1033         low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
1034
1035         return ((u64)high_crc << 31) ^ (u64)low_crc;
1036 }
1037
1038 static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
1039                                      struct btrfs_extent_data_ref *ref)
1040 {
1041         return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
1042                                     btrfs_extent_data_ref_objectid(leaf, ref),
1043                                     btrfs_extent_data_ref_offset(leaf, ref));
1044 }
1045
1046 static int match_extent_data_ref(struct extent_buffer *leaf,
1047                                  struct btrfs_extent_data_ref *ref,
1048                                  u64 root_objectid, u64 owner, u64 offset)
1049 {
1050         if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
1051             btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
1052             btrfs_extent_data_ref_offset(leaf, ref) != offset)
1053                 return 0;
1054         return 1;
1055 }
1056
1057 static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
1058                                            struct btrfs_root *root,
1059                                            struct btrfs_path *path,
1060                                            u64 bytenr, u64 parent,
1061                                            u64 root_objectid,
1062                                            u64 owner, u64 offset)
1063 {
1064         struct btrfs_key key;
1065         struct btrfs_extent_data_ref *ref;
1066         struct extent_buffer *leaf;
1067         u32 nritems;
1068         int ret;
1069         int recow;
1070         int err = -ENOENT;
1071
1072         key.objectid = bytenr;
1073         if (parent) {
1074                 key.type = BTRFS_SHARED_DATA_REF_KEY;
1075                 key.offset = parent;
1076         } else {
1077                 key.type = BTRFS_EXTENT_DATA_REF_KEY;
1078                 key.offset = hash_extent_data_ref(root_objectid,
1079                                                   owner, offset);
1080         }
1081 again:
1082         recow = 0;
1083         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1084         if (ret < 0) {
1085                 err = ret;
1086                 goto fail;
1087         }
1088
1089         if (parent) {
1090                 if (!ret)
1091                         return 0;
1092 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1093                 key.type = BTRFS_EXTENT_REF_V0_KEY;
1094                 btrfs_release_path(path);
1095                 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1096                 if (ret < 0) {
1097                         err = ret;
1098                         goto fail;
1099                 }
1100                 if (!ret)
1101                         return 0;
1102 #endif
1103                 goto fail;
1104         }
1105
1106         leaf = path->nodes[0];
1107         nritems = btrfs_header_nritems(leaf);
1108         while (1) {
1109                 if (path->slots[0] >= nritems) {
1110                         ret = btrfs_next_leaf(root, path);
1111                         if (ret < 0)
1112                                 err = ret;
1113                         if (ret)
1114                                 goto fail;
1115
1116                         leaf = path->nodes[0];
1117                         nritems = btrfs_header_nritems(leaf);
1118                         recow = 1;
1119                 }
1120
1121                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1122                 if (key.objectid != bytenr ||
1123                     key.type != BTRFS_EXTENT_DATA_REF_KEY)
1124                         goto fail;
1125
1126                 ref = btrfs_item_ptr(leaf, path->slots[0],
1127                                      struct btrfs_extent_data_ref);
1128
1129                 if (match_extent_data_ref(leaf, ref, root_objectid,
1130                                           owner, offset)) {
1131                         if (recow) {
1132                                 btrfs_release_path(path);
1133                                 goto again;
1134                         }
1135                         err = 0;
1136                         break;
1137                 }
1138                 path->slots[0]++;
1139         }
1140 fail:
1141         return err;
1142 }
1143
1144 static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
1145                                            struct btrfs_root *root,
1146                                            struct btrfs_path *path,
1147                                            u64 bytenr, u64 parent,
1148                                            u64 root_objectid, u64 owner,
1149                                            u64 offset, int refs_to_add)
1150 {
1151         struct btrfs_key key;
1152         struct extent_buffer *leaf;
1153         u32 size;
1154         u32 num_refs;
1155         int ret;
1156
1157         key.objectid = bytenr;
1158         if (parent) {
1159                 key.type = BTRFS_SHARED_DATA_REF_KEY;
1160                 key.offset = parent;
1161                 size = sizeof(struct btrfs_shared_data_ref);
1162         } else {
1163                 key.type = BTRFS_EXTENT_DATA_REF_KEY;
1164                 key.offset = hash_extent_data_ref(root_objectid,
1165                                                   owner, offset);
1166                 size = sizeof(struct btrfs_extent_data_ref);
1167         }
1168
1169         ret = btrfs_insert_empty_item(trans, root, path, &key, size);
1170         if (ret && ret != -EEXIST)
1171                 goto fail;
1172
1173         leaf = path->nodes[0];
1174         if (parent) {
1175                 struct btrfs_shared_data_ref *ref;
1176                 ref = btrfs_item_ptr(leaf, path->slots[0],
1177                                      struct btrfs_shared_data_ref);
1178                 if (ret == 0) {
1179                         btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
1180                 } else {
1181                         num_refs = btrfs_shared_data_ref_count(leaf, ref);
1182                         num_refs += refs_to_add;
1183                         btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
1184                 }
1185         } else {
1186                 struct btrfs_extent_data_ref *ref;
1187                 while (ret == -EEXIST) {
1188                         ref = btrfs_item_ptr(leaf, path->slots[0],
1189                                              struct btrfs_extent_data_ref);
1190                         if (match_extent_data_ref(leaf, ref, root_objectid,
1191                                                   owner, offset))
1192                                 break;
1193                         btrfs_release_path(path);
1194                         key.offset++;
1195                         ret = btrfs_insert_empty_item(trans, root, path, &key,
1196                                                       size);
1197                         if (ret && ret != -EEXIST)
1198                                 goto fail;
1199
1200                         leaf = path->nodes[0];
1201                 }
1202                 ref = btrfs_item_ptr(leaf, path->slots[0],
1203                                      struct btrfs_extent_data_ref);
1204                 if (ret == 0) {
1205                         btrfs_set_extent_data_ref_root(leaf, ref,
1206                                                        root_objectid);
1207                         btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
1208                         btrfs_set_extent_data_ref_offset(leaf, ref, offset);
1209                         btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
1210                 } else {
1211                         num_refs = btrfs_extent_data_ref_count(leaf, ref);
1212                         num_refs += refs_to_add;
1213                         btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
1214                 }
1215         }
1216         btrfs_mark_buffer_dirty(leaf);
1217         ret = 0;
1218 fail:
1219         btrfs_release_path(path);
1220         return ret;
1221 }
1222
1223 static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
1224                                            struct btrfs_root *root,
1225                                            struct btrfs_path *path,
1226                                            int refs_to_drop)
1227 {
1228         struct btrfs_key key;
1229         struct btrfs_extent_data_ref *ref1 = NULL;
1230         struct btrfs_shared_data_ref *ref2 = NULL;
1231         struct extent_buffer *leaf;
1232         u32 num_refs = 0;
1233         int ret = 0;
1234
1235         leaf = path->nodes[0];
1236         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1237
1238         if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1239                 ref1 = btrfs_item_ptr(leaf, path->slots[0],
1240                                       struct btrfs_extent_data_ref);
1241                 num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1242         } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1243                 ref2 = btrfs_item_ptr(leaf, path->slots[0],
1244                                       struct btrfs_shared_data_ref);
1245                 num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1246 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1247         } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
1248                 struct btrfs_extent_ref_v0 *ref0;
1249                 ref0 = btrfs_item_ptr(leaf, path->slots[0],
1250                                       struct btrfs_extent_ref_v0);
1251                 num_refs = btrfs_ref_count_v0(leaf, ref0);
1252 #endif
1253         } else {
1254                 BUG();
1255         }
1256
1257         BUG_ON(num_refs < refs_to_drop);
1258         num_refs -= refs_to_drop;
1259
1260         if (num_refs == 0) {
1261                 ret = btrfs_del_item(trans, root, path);
1262         } else {
1263                 if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
1264                         btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
1265                 else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
1266                         btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
1267 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1268                 else {
1269                         struct btrfs_extent_ref_v0 *ref0;
1270                         ref0 = btrfs_item_ptr(leaf, path->slots[0],
1271                                         struct btrfs_extent_ref_v0);
1272                         btrfs_set_ref_count_v0(leaf, ref0, num_refs);
1273                 }
1274 #endif
1275                 btrfs_mark_buffer_dirty(leaf);
1276         }
1277         return ret;
1278 }
1279
1280 static noinline u32 extent_data_ref_count(struct btrfs_root *root,
1281                                           struct btrfs_path *path,
1282                                           struct btrfs_extent_inline_ref *iref)
1283 {
1284         struct btrfs_key key;
1285         struct extent_buffer *leaf;
1286         struct btrfs_extent_data_ref *ref1;
1287         struct btrfs_shared_data_ref *ref2;
1288         u32 num_refs = 0;
1289
1290         leaf = path->nodes[0];
1291         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1292         if (iref) {
1293                 if (btrfs_extent_inline_ref_type(leaf, iref) ==
1294                     BTRFS_EXTENT_DATA_REF_KEY) {
1295                         ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
1296                         num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1297                 } else {
1298                         ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
1299                         num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1300                 }
1301         } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1302                 ref1 = btrfs_item_ptr(leaf, path->slots[0],
1303                                       struct btrfs_extent_data_ref);
1304                 num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1305         } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1306                 ref2 = btrfs_item_ptr(leaf, path->slots[0],
1307                                       struct btrfs_shared_data_ref);
1308                 num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1309 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1310         } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
1311                 struct btrfs_extent_ref_v0 *ref0;
1312                 ref0 = btrfs_item_ptr(leaf, path->slots[0],
1313                                       struct btrfs_extent_ref_v0);
1314                 num_refs = btrfs_ref_count_v0(leaf, ref0);
1315 #endif
1316         } else {
1317                 WARN_ON(1);
1318         }
1319         return num_refs;
1320 }
1321
1322 static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
1323                                           struct btrfs_root *root,
1324                                           struct btrfs_path *path,
1325                                           u64 bytenr, u64 parent,
1326                                           u64 root_objectid)
1327 {
1328         struct btrfs_key key;
1329         int ret;
1330
1331         key.objectid = bytenr;
1332         if (parent) {
1333                 key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1334                 key.offset = parent;
1335         } else {
1336                 key.type = BTRFS_TREE_BLOCK_REF_KEY;
1337                 key.offset = root_objectid;
1338         }
1339
1340         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1341         if (ret > 0)
1342                 ret = -ENOENT;
1343 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1344         if (ret == -ENOENT && parent) {
1345                 btrfs_release_path(path);
1346                 key.type = BTRFS_EXTENT_REF_V0_KEY;
1347                 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1348                 if (ret > 0)
1349                         ret = -ENOENT;
1350         }
1351 #endif
1352         return ret;
1353 }
1354
1355 static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
1356                                           struct btrfs_root *root,
1357                                           struct btrfs_path *path,
1358                                           u64 bytenr, u64 parent,
1359                                           u64 root_objectid)
1360 {
1361         struct btrfs_key key;
1362         int ret;
1363
1364         key.objectid = bytenr;
1365         if (parent) {
1366                 key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1367                 key.offset = parent;
1368         } else {
1369                 key.type = BTRFS_TREE_BLOCK_REF_KEY;
1370                 key.offset = root_objectid;
1371         }
1372
1373         ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
1374         btrfs_release_path(path);
1375         return ret;
1376 }
1377
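/*
 * map (parent, owner) to the matching backref key type: tree blocks use
 * SHARED_BLOCK_REF/TREE_BLOCK_REF, data extents use
 * SHARED_DATA_REF/EXTENT_DATA_REF; a non-zero parent selects the shared
 * variant.
 */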
1378 static inline int extent_ref_type(u64 parent, u64 owner)
1379 {
1380         int type;
1381         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1382                 if (parent > 0)
1383                         type = BTRFS_SHARED_BLOCK_REF_KEY;
1384                 else
1385                         type = BTRFS_TREE_BLOCK_REF_KEY;
1386         } else {
1387                 if (parent > 0)
1388                         type = BTRFS_SHARED_DATA_REF_KEY;
1389                 else
1390                         type = BTRFS_EXTENT_DATA_REF_KEY;
1391         }
1392         return type;
1393 }
1394
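/*
 * return the key that follows the current path position, walking up the
 * path when the current slot is the last one in its node.  Returns 0 and
 * fills @key on success, 1 if there is no next key.
 */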
1395 static int find_next_key(struct btrfs_path *path, int level,
1396                          struct btrfs_key *key)
1397
1398 {
1399         for (; level < BTRFS_MAX_LEVEL; level++) {
1400                 if (!path->nodes[level])
1401                         break;
1402                 if (path->slots[level] + 1 >=
1403                     btrfs_header_nritems(path->nodes[level]))
1404                         continue;
1405                 if (level == 0)
1406                         btrfs_item_key_to_cpu(path->nodes[level], key,
1407                                               path->slots[level] + 1);
1408                 else
1409                         btrfs_node_key_to_cpu(path->nodes[level], key,
1410                                               path->slots[level] + 1);
1411                 return 0;
1412         }
1413         return 1;
1414 }
1415
1416 /*
1417  * look for an inline back ref.  If the back ref is found, *ref_ret is
1418  * set to the address of the inline back ref, and 0 is returned.
1419  *
1420  * If the back ref isn't found, *ref_ret is set to the address where it
1421  * should be inserted, and -ENOENT is returned.
1422  *
1423  * If insert is true and there are too many inline back refs, the path
1424  * points to the extent item, and -EAGAIN is returned.
1425  *
1426  * NOTE: inline back refs are ordered in the same way that back ref
1427  *       items in the tree are ordered.
1428  */
1429 static noinline_for_stack
1430 int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
1431                                  struct btrfs_root *root,
1432                                  struct btrfs_path *path,
1433                                  struct btrfs_extent_inline_ref **ref_ret,
1434                                  u64 bytenr, u64 num_bytes,
1435                                  u64 parent, u64 root_objectid,
1436                                  u64 owner, u64 offset, int insert)
1437 {
1438         struct btrfs_key key;
1439         struct extent_buffer *leaf;
1440         struct btrfs_extent_item *ei;
1441         struct btrfs_extent_inline_ref *iref;
1442         u64 flags;
1443         u64 item_size;
1444         unsigned long ptr;
1445         unsigned long end;
1446         int extra_size;
1447         int type;
1448         int want;
1449         int ret;
1450         int err = 0;
1451
1452         key.objectid = bytenr;
1453         key.type = BTRFS_EXTENT_ITEM_KEY;
1454         key.offset = num_bytes;
1455
1456         want = extent_ref_type(parent, owner);
1457         if (insert) {
1458                 extra_size = btrfs_extent_inline_ref_size(want);
1459                 path->keep_locks = 1;
1460         } else
1461                 extra_size = -1;
1462         ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
1463         if (ret < 0) {
1464                 err = ret;
1465                 goto out;
1466         }
1467         if (ret && !insert) {
1468                 err = -ENOENT;
1469                 goto out;
1470         } else if (ret) {
1471                 err = -EIO;
1472                 WARN_ON(1);
1473                 goto out;
1474         }
1475
1476         leaf = path->nodes[0];
1477         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1478 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1479         if (item_size < sizeof(*ei)) {
1480                 if (!insert) {
1481                         err = -ENOENT;
1482                         goto out;
1483                 }
1484                 ret = convert_extent_item_v0(trans, root, path, owner,
1485                                              extra_size);
1486                 if (ret < 0) {
1487                         err = ret;
1488                         goto out;
1489                 }
1490                 leaf = path->nodes[0];
1491                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1492         }
1493 #endif
1494         BUG_ON(item_size < sizeof(*ei));
1495
1496         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1497         flags = btrfs_extent_flags(leaf, ei);
1498
1499         ptr = (unsigned long)(ei + 1);
1500         end = (unsigned long)ei + item_size;
1501
1502         if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
1503                 ptr += sizeof(struct btrfs_tree_block_info);
1504                 BUG_ON(ptr > end);
1505         } else {
1506                 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA));
1507         }
1508
1509         err = -ENOENT;
1510         while (1) {
1511                 if (ptr >= end) {
1512                         WARN_ON(ptr > end);
1513                         break;
1514                 }
1515                 iref = (struct btrfs_extent_inline_ref *)ptr;
1516                 type = btrfs_extent_inline_ref_type(leaf, iref);
1517                 if (want < type)
1518                         break;
1519                 if (want > type) {
1520                         ptr += btrfs_extent_inline_ref_size(type);
1521                         continue;
1522                 }
1523
1524                 if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1525                         struct btrfs_extent_data_ref *dref;
1526                         dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1527                         if (match_extent_data_ref(leaf, dref, root_objectid,
1528                                                   owner, offset)) {
1529                                 err = 0;
1530                                 break;
1531                         }
1532                         if (hash_extent_data_ref_item(leaf, dref) <
1533                             hash_extent_data_ref(root_objectid, owner, offset))
1534                                 break;
1535                 } else {
1536                         u64 ref_offset;
1537                         ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
1538                         if (parent > 0) {
1539                                 if (parent == ref_offset) {
1540                                         err = 0;
1541                                         break;
1542                                 }
1543                                 if (ref_offset < parent)
1544                                         break;
1545                         } else {
1546                                 if (root_objectid == ref_offset) {
1547                                         err = 0;
1548                                         break;
1549                                 }
1550                                 if (ref_offset < root_objectid)
1551                                         break;
1552                         }
1553                 }
1554                 ptr += btrfs_extent_inline_ref_size(type);
1555         }
1556         if (err == -ENOENT && insert) {
1557                 if (item_size + extra_size >=
1558                     BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
1559                         err = -EAGAIN;
1560                         goto out;
1561                 }
1562                 /*
1563                  * To add a new inline back ref, we have to make sure
1564                  * there is no corresponding back ref item.
1565                  * For simplicity, we just do not add a new inline back
1566                  * ref if there is any kind of item for this block.
1567                  */
1568                 if (find_next_key(path, 0, &key) == 0 &&
1569                     key.objectid == bytenr &&
1570                     key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
1571                         err = -EAGAIN;
1572                         goto out;
1573                 }
1574         }
1575         *ref_ret = (struct btrfs_extent_inline_ref *)ptr;
1576 out:
1577         if (insert) {
1578                 path->keep_locks = 0;
1579                 btrfs_unlock_up_safe(path, 1);
1580         }
1581         return err;
1582 }
1583
1584 /*
1585  * helper to add new inline back ref
1586  */
1587 static noinline_for_stack
1588 void setup_inline_extent_backref(struct btrfs_trans_handle *trans,
1589                                  struct btrfs_root *root,
1590                                  struct btrfs_path *path,
1591                                  struct btrfs_extent_inline_ref *iref,
1592                                  u64 parent, u64 root_objectid,
1593                                  u64 owner, u64 offset, int refs_to_add,
1594                                  struct btrfs_delayed_extent_op *extent_op)
1595 {
1596         struct extent_buffer *leaf;
1597         struct btrfs_extent_item *ei;
1598         unsigned long ptr;
1599         unsigned long end;
1600         unsigned long item_offset;
1601         u64 refs;
1602         int size;
1603         int type;
1604
1605         leaf = path->nodes[0];
1606         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1607         item_offset = (unsigned long)iref - (unsigned long)ei;
1608
1609         type = extent_ref_type(parent, owner);
1610         size = btrfs_extent_inline_ref_size(type);
1611
1612         btrfs_extend_item(trans, root, path, size);
1613
1614         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1615         refs = btrfs_extent_refs(leaf, ei);
1616         refs += refs_to_add;
1617         btrfs_set_extent_refs(leaf, ei, refs);
1618         if (extent_op)
1619                 __run_delayed_extent_op(extent_op, leaf, ei);
1620
1621         ptr = (unsigned long)ei + item_offset;
1622         end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]);
1623         if (ptr < end - size)
1624                 memmove_extent_buffer(leaf, ptr + size, ptr,
1625                                       end - size - ptr);
1626
1627         iref = (struct btrfs_extent_inline_ref *)ptr;
1628         btrfs_set_extent_inline_ref_type(leaf, iref, type);
1629         if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1630                 struct btrfs_extent_data_ref *dref;
1631                 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1632                 btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
1633                 btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
1634                 btrfs_set_extent_data_ref_offset(leaf, dref, offset);
1635                 btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
1636         } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1637                 struct btrfs_shared_data_ref *sref;
1638                 sref = (struct btrfs_shared_data_ref *)(iref + 1);
1639                 btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
1640                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1641         } else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
1642                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1643         } else {
1644                 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
1645         }
1646         btrfs_mark_buffer_dirty(leaf);
1647 }
1648
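/*
 * look up a backref for the given extent, trying the inline backref in the
 * extent item first and falling back to a keyed backref item (tree block
 * or data ref, depending on the owner).
 */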
1649 static int lookup_extent_backref(struct btrfs_trans_handle *trans,
1650                                  struct btrfs_root *root,
1651                                  struct btrfs_path *path,
1652                                  struct btrfs_extent_inline_ref **ref_ret,
1653                                  u64 bytenr, u64 num_bytes, u64 parent,
1654                                  u64 root_objectid, u64 owner, u64 offset)
1655 {
1656         int ret;
1657
1658         ret = lookup_inline_extent_backref(trans, root, path, ref_ret,
1659                                            bytenr, num_bytes, parent,
1660                                            root_objectid, owner, offset, 0);
1661         if (ret != -ENOENT)
1662                 return ret;
1663
1664         btrfs_release_path(path);
1665         *ref_ret = NULL;
1666
1667         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1668                 ret = lookup_tree_block_ref(trans, root, path, bytenr, parent,
1669                                             root_objectid);
1670         } else {
1671                 ret = lookup_extent_data_ref(trans, root, path, bytenr, parent,
1672                                              root_objectid, owner, offset);
1673         }
1674         return ret;
1675 }
1676
1677 /*
1678  * helper to update/remove inline back ref
1679  */
1680 static noinline_for_stack
1681 void update_inline_extent_backref(struct btrfs_trans_handle *trans,
1682                                   struct btrfs_root *root,
1683                                   struct btrfs_path *path,
1684                                   struct btrfs_extent_inline_ref *iref,
1685                                   int refs_to_mod,
1686                                   struct btrfs_delayed_extent_op *extent_op)
1687 {
1688         struct extent_buffer *leaf;
1689         struct btrfs_extent_item *ei;
1690         struct btrfs_extent_data_ref *dref = NULL;
1691         struct btrfs_shared_data_ref *sref = NULL;
1692         unsigned long ptr;
1693         unsigned long end;
1694         u32 item_size;
1695         int size;
1696         int type;
1697         u64 refs;
1698
1699         leaf = path->nodes[0];
1700         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1701         refs = btrfs_extent_refs(leaf, ei);
1702         WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
1703         refs += refs_to_mod;
1704         btrfs_set_extent_refs(leaf, ei, refs);
1705         if (extent_op)
1706                 __run_delayed_extent_op(extent_op, leaf, ei);
1707
1708         type = btrfs_extent_inline_ref_type(leaf, iref);
1709
1710         if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1711                 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1712                 refs = btrfs_extent_data_ref_count(leaf, dref);
1713         } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1714                 sref = (struct btrfs_shared_data_ref *)(iref + 1);
1715                 refs = btrfs_shared_data_ref_count(leaf, sref);
1716         } else {
1717                 refs = 1;
1718                 BUG_ON(refs_to_mod != -1);
1719         }
1720
1721         BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
1722         refs += refs_to_mod;
1723
1724         if (refs > 0) {
1725                 if (type == BTRFS_EXTENT_DATA_REF_KEY)
1726                         btrfs_set_extent_data_ref_count(leaf, dref, refs);
1727                 else
1728                         btrfs_set_shared_data_ref_count(leaf, sref, refs);
1729         } else {
1730                 size = btrfs_extent_inline_ref_size(type);
1731                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1732                 ptr = (unsigned long)iref;
1733                 end = (unsigned long)ei + item_size;
1734                 if (ptr + size < end)
1735                         memmove_extent_buffer(leaf, ptr, ptr + size,
1736                                               end - ptr - size);
1737                 item_size -= size;
1738                 btrfs_truncate_item(trans, root, path, item_size, 1);
1739         }
1740         btrfs_mark_buffer_dirty(leaf);
1741 }
1742
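/*
 * add an inline backref to the extent item: bump the count of an existing
 * inline ref, or insert a new one if there is none.  Passes -EAGAIN back
 * to the caller when the ref has to be added as a keyed item instead.
 */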
1743 static noinline_for_stack
1744 int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
1745                                  struct btrfs_root *root,
1746                                  struct btrfs_path *path,
1747                                  u64 bytenr, u64 num_bytes, u64 parent,
1748                                  u64 root_objectid, u64 owner,
1749                                  u64 offset, int refs_to_add,
1750                                  struct btrfs_delayed_extent_op *extent_op)
1751 {
1752         struct btrfs_extent_inline_ref *iref;
1753         int ret;
1754
1755         ret = lookup_inline_extent_backref(trans, root, path, &iref,
1756                                            bytenr, num_bytes, parent,
1757                                            root_objectid, owner, offset, 1);
1758         if (ret == 0) {
1759                 BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
1760                 update_inline_extent_backref(trans, root, path, iref,
1761                                              refs_to_add, extent_op);
1762         } else if (ret == -ENOENT) {
1763                 setup_inline_extent_backref(trans, root, path, iref, parent,
1764                                             root_objectid, owner, offset,
1765                                             refs_to_add, extent_op);
1766                 ret = 0;
1767         }
1768         return ret;
1769 }
1770
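/*
 * insert a keyed (non-inline) backref item, dispatching on whether the
 * owner is a tree block or a data extent.
 */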
1771 static int insert_extent_backref(struct btrfs_trans_handle *trans,
1772                                  struct btrfs_root *root,
1773                                  struct btrfs_path *path,
1774                                  u64 bytenr, u64 parent, u64 root_objectid,
1775                                  u64 owner, u64 offset, int refs_to_add)
1776 {
1777         int ret;
1778         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1779                 BUG_ON(refs_to_add != 1);
1780                 ret = insert_tree_block_ref(trans, root, path, bytenr,
1781                                             parent, root_objectid);
1782         } else {
1783                 ret = insert_extent_data_ref(trans, root, path, bytenr,
1784                                              parent, root_objectid,
1785                                              owner, offset, refs_to_add);
1786         }
1787         return ret;
1788 }
1789
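/*
 * drop refs_to_drop references from a backref: inline refs are updated in
 * place, keyed data refs are handled by remove_extent_data_ref(), and
 * keyed tree block refs are deleted outright.
 */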
1790 static int remove_extent_backref(struct btrfs_trans_handle *trans,
1791                                  struct btrfs_root *root,
1792                                  struct btrfs_path *path,
1793                                  struct btrfs_extent_inline_ref *iref,
1794                                  int refs_to_drop, int is_data)
1795 {
1796         int ret = 0;
1797
1798         BUG_ON(!is_data && refs_to_drop != 1);
1799         if (iref) {
1800                 update_inline_extent_backref(trans, root, path, iref,
1801                                              -refs_to_drop, NULL);
1802         } else if (is_data) {
1803                 ret = remove_extent_data_ref(trans, root, path, refs_to_drop);
1804         } else {
1805                 ret = btrfs_del_item(trans, root, path);
1806         }
1807         return ret;
1808 }
1809
1810 static int btrfs_issue_discard(struct block_device *bdev,
1811                                 u64 start, u64 len)
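/* issue a discard for the byte range, converting bytes to 512-byte sectors */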
1812 {
1813         return blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_NOFS, 0);
1814 }
1815
1816 static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
1817                                 u64 num_bytes, u64 *actual_bytes)
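/*
 * discard a logical extent: map it to its physical stripes and issue a
 * discard on every device that supports it, ignoring -EOPNOTSUPP.  The
 * number of bytes actually discarded is returned via @actual_bytes.
 */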
1818 {
1819         int ret;
1820         u64 discarded_bytes = 0;
1821         struct btrfs_bio *bbio = NULL;
1822
1823
1824         /* Tell the block device(s) that the sectors can be discarded */
1825         ret = btrfs_map_block(root->fs_info, REQ_DISCARD,
1826                               bytenr, &num_bytes, &bbio, 0);
1827         /* Error condition is -ENOMEM */
1828         if (!ret) {
1829                 struct btrfs_bio_stripe *stripe = bbio->stripes;
1830                 int i;
1831
1832
1833                 for (i = 0; i < bbio->num_stripes; i++, stripe++) {
1834                         if (!stripe->dev->can_discard)
1835                                 continue;
1836
1837                         ret = btrfs_issue_discard(stripe->dev->bdev,
1838                                                   stripe->physical,
1839                                                   stripe->length);
1840                         if (!ret)
1841                                 discarded_bytes += stripe->length;
1842                         else if (ret != -EOPNOTSUPP)
1843                                 break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */
1844
1845                         /*
1846                          * In case we get back EOPNOTSUPP for some reason,
1847                          * just ignore the return value so we don't screw up
1848                          * people calling discard_extent.
1849                          */
1850                         ret = 0;
1851                 }
1852                 kfree(bbio);
1853         }
1854
1855         if (actual_bytes)
1856                 *actual_bytes = discarded_bytes;
1857
1858
1859         if (ret == -EOPNOTSUPP)
1860                 ret = 0;
1861         return ret;
1862 }
1863
1864 /* Can return -ENOMEM */
1865 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1866                          struct btrfs_root *root,
1867                          u64 bytenr, u64 num_bytes, u64 parent,
1868                          u64 root_objectid, u64 owner, u64 offset, int for_cow)
1869 {
1870         int ret;
1871         struct btrfs_fs_info *fs_info = root->fs_info;
1872
1873         BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
1874                root_objectid == BTRFS_TREE_LOG_OBJECTID);
1875
1876         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1877                 ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
1878                                         num_bytes,
1879                                         parent, root_objectid, (int)owner,
1880                                         BTRFS_ADD_DELAYED_REF, NULL, for_cow);
1881         } else {
1882                 ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
1883                                         num_bytes,
1884                                         parent, root_objectid, owner, offset,
1885                                         BTRFS_ADD_DELAYED_REF, NULL, for_cow);
1886         }
1887         return ret;
1888 }
1889
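/*
 * add refs_to_add references to an existing extent.  The inline backref is
 * tried first; if that returns -EAGAIN, the ref count on the extent item
 * is bumped directly and a keyed backref item is inserted instead.
 */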
1890 static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1891                                   struct btrfs_root *root,
1892                                   u64 bytenr, u64 num_bytes,
1893                                   u64 parent, u64 root_objectid,
1894                                   u64 owner, u64 offset, int refs_to_add,
1895                                   struct btrfs_delayed_extent_op *extent_op)
1896 {
1897         struct btrfs_path *path;
1898         struct extent_buffer *leaf;
1899         struct btrfs_extent_item *item;
1900         u64 refs;
1901         int ret;
1902         int err = 0;
1903
1904         path = btrfs_alloc_path();
1905         if (!path)
1906                 return -ENOMEM;
1907
1908         path->reada = 1;
1909         path->leave_spinning = 1;
1910         /* this will set up the path even if it fails to insert the back ref */
1911         ret = insert_inline_extent_backref(trans, root->fs_info->extent_root,
1912                                            path, bytenr, num_bytes, parent,
1913                                            root_objectid, owner, offset,
1914                                            refs_to_add, extent_op);
1915         if (ret == 0)
1916                 goto out;
1917
1918         if (ret != -EAGAIN) {
1919                 err = ret;
1920                 goto out;
1921         }
1922
1923         leaf = path->nodes[0];
1924         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1925         refs = btrfs_extent_refs(leaf, item);
1926         btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
1927         if (extent_op)
1928                 __run_delayed_extent_op(extent_op, leaf, item);
1929
1930         btrfs_mark_buffer_dirty(leaf);
1931         btrfs_release_path(path);
1932
1933         path->reada = 1;
1934         path->leave_spinning = 1;
1935
1936         /* now insert the actual backref */
1937         ret = insert_extent_backref(trans, root->fs_info->extent_root,
1938                                     path, bytenr, parent, root_objectid,
1939                                     owner, offset, refs_to_add);
1940         if (ret)
1941                 btrfs_abort_transaction(trans, root, ret);
1942 out:
1943         btrfs_free_path(path);
1944         return err;
1945 }
1946
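/*
 * process a single delayed data ref: insert the reserved file extent, add
 * a backref, or drop a backref, depending on the action.
 */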
1947 static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
1948                                 struct btrfs_root *root,
1949                                 struct btrfs_delayed_ref_node *node,
1950                                 struct btrfs_delayed_extent_op *extent_op,
1951                                 int insert_reserved)
1952 {
1953         int ret = 0;
1954         struct btrfs_delayed_data_ref *ref;
1955         struct btrfs_key ins;
1956         u64 parent = 0;
1957         u64 ref_root = 0;
1958         u64 flags = 0;
1959
1960         ins.objectid = node->bytenr;
1961         ins.offset = node->num_bytes;
1962         ins.type = BTRFS_EXTENT_ITEM_KEY;
1963
1964         ref = btrfs_delayed_node_to_data_ref(node);
1965         if (node->type == BTRFS_SHARED_DATA_REF_KEY)
1966                 parent = ref->parent;
1967         else
1968                 ref_root = ref->root;
1969
1970         if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
1971                 if (extent_op) {
1972                         BUG_ON(extent_op->update_key);
1973                         flags |= extent_op->flags_to_set;
1974                 }
1975                 ret = alloc_reserved_file_extent(trans, root,
1976                                                  parent, ref_root, flags,
1977                                                  ref->objectid, ref->offset,
1978                                                  &ins, node->ref_mod);
1979         } else if (node->action == BTRFS_ADD_DELAYED_REF) {
1980                 ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
1981                                              node->num_bytes, parent,
1982                                              ref_root, ref->objectid,
1983                                              ref->offset, node->ref_mod,
1984                                              extent_op);
1985         } else if (node->action == BTRFS_DROP_DELAYED_REF) {
1986                 ret = __btrfs_free_extent(trans, root, node->bytenr,
1987                                           node->num_bytes, parent,
1988                                           ref_root, ref->objectid,
1989                                           ref->offset, node->ref_mod,
1990                                           extent_op);
1991         } else {
1992                 BUG();
1993         }
1994         return ret;
1995 }
1996
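/*
 * apply a delayed extent op to an extent item in place: update the flags
 * and/or the tree block key stored right after the item.
 */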
1997 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
1998                                     struct extent_buffer *leaf,
1999                                     struct btrfs_extent_item *ei)
2000 {
2001         u64 flags = btrfs_extent_flags(leaf, ei);
2002         if (extent_op->update_flags) {
2003                 flags |= extent_op->flags_to_set;
2004                 btrfs_set_extent_flags(leaf, ei, flags);
2005         }
2006
2007         if (extent_op->update_key) {
2008                 struct btrfs_tree_block_info *bi;
2009                 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
2010                 bi = (struct btrfs_tree_block_info *)(ei + 1);
2011                 btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
2012         }
2013 }
2014
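/*
 * find the extent item a delayed extent op refers to and apply the
 * flag/key updates to it.
 */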
2015 static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
2016                                  struct btrfs_root *root,
2017                                  struct btrfs_delayed_ref_node *node,
2018                                  struct btrfs_delayed_extent_op *extent_op)
2019 {
2020         struct btrfs_key key;
2021         struct btrfs_path *path;
2022         struct btrfs_extent_item *ei;
2023         struct extent_buffer *leaf;
2024         u32 item_size;
2025         int ret;
2026         int err = 0;
2027
2028         if (trans->aborted)
2029                 return 0;
2030
2031         path = btrfs_alloc_path();
2032         if (!path)
2033                 return -ENOMEM;
2034
2035         key.objectid = node->bytenr;
2036         key.type = BTRFS_EXTENT_ITEM_KEY;
2037         key.offset = node->num_bytes;
2038
2039         path->reada = 1;
2040         path->leave_spinning = 1;
2041         ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key,
2042                                 path, 0, 1);
2043         if (ret < 0) {
2044                 err = ret;
2045                 goto out;
2046         }
2047         if (ret > 0) {
2048                 err = -EIO;
2049                 goto out;
2050         }
2051
2052         leaf = path->nodes[0];
2053         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2054 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
2055         if (item_size < sizeof(*ei)) {
2056                 ret = convert_extent_item_v0(trans, root->fs_info->extent_root,
2057                                              path, (u64)-1, 0);
2058                 if (ret < 0) {
2059                         err = ret;
2060                         goto out;
2061                 }
2062                 leaf = path->nodes[0];
2063                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2064         }
2065 #endif
2066         BUG_ON(item_size < sizeof(*ei));
2067         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2068         __run_delayed_extent_op(extent_op, leaf, ei);
2069
2070         btrfs_mark_buffer_dirty(leaf);
2071 out:
2072         btrfs_free_path(path);
2073         return err;
2074 }
2075
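/*
 * process a single delayed tree block ref: insert the reserved tree block,
 * add a backref, or drop a backref, depending on the action.
 */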
2076 static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
2077                                 struct btrfs_root *root,
2078                                 struct btrfs_delayed_ref_node *node,
2079                                 struct btrfs_delayed_extent_op *extent_op,
2080                                 int insert_reserved)
2081 {
2082         int ret = 0;
2083         struct btrfs_delayed_tree_ref *ref;
2084         struct btrfs_key ins;
2085         u64 parent = 0;
2086         u64 ref_root = 0;
2087
2088         ins.objectid = node->bytenr;
2089         ins.offset = node->num_bytes;
2090         ins.type = BTRFS_EXTENT_ITEM_KEY;
2091
2092         ref = btrfs_delayed_node_to_tree_ref(node);
2093         if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2094                 parent = ref->parent;
2095         else
2096                 ref_root = ref->root;
2097
2098         BUG_ON(node->ref_mod != 1);
2099         if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
2100                 BUG_ON(!extent_op || !extent_op->update_flags ||
2101                        !extent_op->update_key);
2102                 ret = alloc_reserved_tree_block(trans, root,
2103                                                 parent, ref_root,
2104                                                 extent_op->flags_to_set,
2105                                                 &extent_op->key,
2106                                                 ref->level, &ins);
2107         } else if (node->action == BTRFS_ADD_DELAYED_REF) {
2108                 ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
2109                                              node->num_bytes, parent, ref_root,
2110                                              ref->level, 0, 1, extent_op);
2111         } else if (node->action == BTRFS_DROP_DELAYED_REF) {
2112                 ret = __btrfs_free_extent(trans, root, node->bytenr,
2113                                           node->num_bytes, parent, ref_root,
2114                                           ref->level, 0, 1, extent_op);
2115         } else {
2116                 BUG();
2117         }
2118         return ret;
2119 }
2120
2121 /* helper function to actually process a single delayed ref entry */
2122 static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
2123                                struct btrfs_root *root,
2124                                struct btrfs_delayed_ref_node *node,
2125                                struct btrfs_delayed_extent_op *extent_op,
2126                                int insert_reserved)
2127 {
2128         int ret = 0;
2129
2130         if (trans->aborted)
2131                 return 0;
2132
2133         if (btrfs_delayed_ref_is_head(node)) {
2134                 struct btrfs_delayed_ref_head *head;
2135                 /*
2136                  * we've hit the end of the chain and we were supposed
2137                  * to insert this extent into the tree.  But it got
2138                  * deleted before we ever needed to insert it, so all
2139                  * we have to do is clean up the accounting.
2140                  */
2141                 BUG_ON(extent_op);
2142                 head = btrfs_delayed_node_to_head(node);
2143                 if (insert_reserved) {
2144                         btrfs_pin_extent(root, node->bytenr,
2145                                          node->num_bytes, 1);
2146                         if (head->is_data) {
2147                                 ret = btrfs_del_csums(trans, root,
2148                                                       node->bytenr,
2149                                                       node->num_bytes);
2150                         }
2151                 }
2152                 return ret;
2153         }
2154
2155         if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
2156             node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2157                 ret = run_delayed_tree_ref(trans, root, node, extent_op,
2158                                            insert_reserved);
2159         else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
2160                  node->type == BTRFS_SHARED_DATA_REF_KEY)
2161                 ret = run_delayed_data_ref(trans, root, node, extent_op,
2162                                            insert_reserved);
2163         else
2164                 BUG();
2165         return ret;
2166 }
2167
2168 static noinline struct btrfs_delayed_ref_node *
2169 select_delayed_ref(struct btrfs_delayed_ref_head *head)
2170 {
2171         struct rb_node *node;
2172         struct btrfs_delayed_ref_node *ref;
2173         int action = BTRFS_ADD_DELAYED_REF;
2174 again:
2175         /*
2176          * select delayed ref of type BTRFS_ADD_DELAYED_REF first.
2177          * this prevents ref count from going down to zero when
2178          * this prevents the ref count from going down to zero when
2179          * there are still pending delayed refs.
2180         node = rb_prev(&head->node.rb_node);
2181         while (1) {
2182                 if (!node)
2183                         break;
2184                 ref = rb_entry(node, struct btrfs_delayed_ref_node,
2185                                 rb_node);
2186                 if (ref->bytenr != head->node.bytenr)
2187                         break;
2188                 if (ref->action == action)
2189                         return ref;
2190                 node = rb_prev(node);
2191         }
2192         if (action == BTRFS_ADD_DELAYED_REF) {
2193                 action = BTRFS_DROP_DELAYED_REF;
2194                 goto again;
2195         }
2196         return NULL;
2197 }
2198
2199 /*
2200  * Returns the number of delayed refs processed on success.
2201  * Returns -ENOMEM or -EIO on failure and will abort the transaction.
2202  */
2203 static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2204                                        struct btrfs_root *root,
2205                                        struct list_head *cluster)
2206 {
2207         struct btrfs_delayed_ref_root *delayed_refs;
2208         struct btrfs_delayed_ref_node *ref;
2209         struct btrfs_delayed_ref_head *locked_ref = NULL;
2210         struct btrfs_delayed_extent_op *extent_op;
2211         struct btrfs_fs_info *fs_info = root->fs_info;
2212         int ret;
2213         int count = 0;
2214         int must_insert_reserved = 0;
2215
2216         delayed_refs = &trans->transaction->delayed_refs;
2217         while (1) {
2218                 if (!locked_ref) {
2219                         /* pick a new head ref from the cluster list */
2220                         if (list_empty(cluster))
2221                                 break;
2222
2223                         locked_ref = list_entry(cluster->next,
2224                                      struct btrfs_delayed_ref_head, cluster);
2225
2226                         /* grab the lock that says we are going to process
2227                          * all the refs for this head */
2228                         ret = btrfs_delayed_ref_lock(trans, locked_ref);
2229
2230                         /*
2231                          * we may have dropped the spin lock to get the head
2232                          * mutex lock, and that might have given someone else
2233                          * time to free the head.  If that's true, it has been
2234                          * removed from our list and we can move on.
2235                          */
2236                         if (ret == -EAGAIN) {
2237                                 locked_ref = NULL;
2238                                 count++;
2239                                 continue;
2240                         }
2241                 }
2242
2243                 /*
2244                  * We need to try to merge add/drops of the same ref since we
2245                  * can run into issues with relocate dropping the implicit ref
2246                  * and then it being added back again before the drop can
2247                  * finish.  If we merged anything, we need to re-loop so we can
2248                  * get a good ref.
2249                  */
2250                 btrfs_merge_delayed_refs(trans, fs_info, delayed_refs,
2251                                          locked_ref);
2252
2253                 /*
2254                  * locked_ref is the head node, so we have to go one
2255                  * node back for any delayed ref updates
2256                  */
2257                 ref = select_delayed_ref(locked_ref);
2258
2259                 if (ref && ref->seq &&
2260                     btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) {
2261                         /*
2262                          * there are still refs with lower seq numbers in the
2263                          * process of being added. Don't run this ref yet.
2264                          */
2265                         list_del_init(&locked_ref->cluster);
2266                         btrfs_delayed_ref_unlock(locked_ref);
2267                         locked_ref = NULL;
2268                         delayed_refs->num_heads_ready++;
2269                         spin_unlock(&delayed_refs->lock);
2270                         cond_resched();
2271                         spin_lock(&delayed_refs->lock);
2272                         continue;
2273                 }
2274
2275                 /*
2276                  * record the must insert reserved flag before we
2277                  * drop the spin lock.
2278                  */
2279                 must_insert_reserved = locked_ref->must_insert_reserved;
2280                 locked_ref->must_insert_reserved = 0;
2281
2282                 extent_op = locked_ref->extent_op;
2283                 locked_ref->extent_op = NULL;
2284
2285                 if (!ref) {
2286                         /* All delayed refs have been processed.  Go ahead
2287                          * and send the head node to run_one_delayed_ref,
2288                          * so that any accounting fixes can happen.
2289                          */
2290                         ref = &locked_ref->node;
2291
2292                         if (extent_op && must_insert_reserved) {
2293                                 btrfs_free_delayed_extent_op(extent_op);
2294                                 extent_op = NULL;
2295                         }
2296
2297                         if (extent_op) {
2298                                 spin_unlock(&delayed_refs->lock);
2299
2300                                 ret = run_delayed_extent_op(trans, root,
2301                                                             ref, extent_op);
2302                                 btrfs_free_delayed_extent_op(extent_op);
2303
2304                                 if (ret) {
2305                                         printk(KERN_DEBUG
2306                                                "btrfs: run_delayed_extent_op "
2307                                                "returned %d\n", ret);
2308                                         spin_lock(&delayed_refs->lock);
2309                                         btrfs_delayed_ref_unlock(locked_ref);
2310                                         return ret;
2311                                 }
2312
2313                                 goto next;
2314                         }
2315                 }
2316
2317                 ref->in_tree = 0;
2318                 rb_erase(&ref->rb_node, &delayed_refs->root);
2319                 delayed_refs->num_entries--;
2320                 if (!btrfs_delayed_ref_is_head(ref)) {
2321                         /*
2322                          * when we play the delayed ref, also correct the
2323                          * ref_mod on head
2324                          */
2325                         switch (ref->action) {
2326                         case BTRFS_ADD_DELAYED_REF:
2327                         case BTRFS_ADD_DELAYED_EXTENT:
2328                                 locked_ref->node.ref_mod -= ref->ref_mod;
2329                                 break;
2330                         case BTRFS_DROP_DELAYED_REF:
2331                                 locked_ref->node.ref_mod += ref->ref_mod;
2332                                 break;
2333                         default:
2334                                 WARN_ON(1);
2335                         }
2336                 }
2337                 spin_unlock(&delayed_refs->lock);
2338
2339                 ret = run_one_delayed_ref(trans, root, ref, extent_op,
2340                                           must_insert_reserved);
2341
2342                 btrfs_free_delayed_extent_op(extent_op);
2343                 if (ret) {
2344                         btrfs_delayed_ref_unlock(locked_ref);
2345                         btrfs_put_delayed_ref(ref);
2346                         printk(KERN_DEBUG
2347                                "btrfs: run_one_delayed_ref returned %d\n", ret);
2348                         spin_lock(&delayed_refs->lock);
2349                         return ret;
2350                 }
2351
2352                 /*
2353                  * If this node is a head, that means all the refs in this head
2354                  * have been dealt with, and we will pick the next head to deal
2355                  * with, so we must unlock the head and drop it from the cluster
2356                  * list before we release it.
2357                  */
2358                 if (btrfs_delayed_ref_is_head(ref)) {
2359                         list_del_init(&locked_ref->cluster);
2360                         btrfs_delayed_ref_unlock(locked_ref);
2361                         locked_ref = NULL;
2362                 }
2363                 btrfs_put_delayed_ref(ref);
2364                 count++;
2365 next:
2366                 cond_resched();
2367                 spin_lock(&delayed_refs->lock);
2368         }
2369         return count;
2370 }
2371
2372 #ifdef SCRAMBLE_DELAYED_REFS
2373 /*
2374  * Normally delayed refs get processed in ascending bytenr order. This
2375  * correlates in most cases to the order added. To expose dependencies on this
2376  * order, we start to process the tree in the middle instead of the beginning.
2377  */
2378 static u64 find_middle(struct rb_root *root)
2379 {
2380         struct rb_node *n = root->rb_node;
2381         struct btrfs_delayed_ref_node *entry;
2382         int alt = 1;
2383         u64 middle;
2384         u64 first = 0, last = 0;
2385
2386         n = rb_first(root);
2387         if (n) {
2388                 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2389                 first = entry->bytenr;
2390         }
2391         n = rb_last(root);
2392         if (n) {
2393                 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2394                 last = entry->bytenr;
2395         }
2396         n = root->rb_node;
2397
2398         while (n) {
2399                 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2400                 WARN_ON(!entry->in_tree);
2401
2402                 middle = entry->bytenr;
2403
2404                 if (alt)
2405                         n = n->rb_left;
2406                 else
2407                         n = n->rb_right;
2408
2409                 alt = 1 - alt;
2410         }
2411         return middle;
2412 }
2413 #endif
2414
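/*
 * account all queued qgroup ref updates for this transaction and release
 * the tree mod seq element that was pinning them.
 */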
2415 int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
2416                                          struct btrfs_fs_info *fs_info)
2417 {
2418         struct qgroup_update *qgroup_update;
2419         int ret = 0;
2420
2421         if (list_empty(&trans->qgroup_ref_list) !=
2422             !trans->delayed_ref_elem.seq) {
2423                 /* list without seq or seq without list */
2424                 printk(KERN_ERR "btrfs: qgroup accounting update error, list is%s empty, seq is %llu\n",
2425                         list_empty(&trans->qgroup_ref_list) ? "" : " not",
2426                         trans->delayed_ref_elem.seq);
2427                 BUG();
2428         }
2429
2430         if (!trans->delayed_ref_elem.seq)
2431                 return 0;
2432
2433         while (!list_empty(&trans->qgroup_ref_list)) {
2434                 qgroup_update = list_first_entry(&trans->qgroup_ref_list,
2435                                                  struct qgroup_update, list);
2436                 list_del(&qgroup_update->list);
2437                 if (!ret)
2438                         ret = btrfs_qgroup_account_ref(
2439                                         trans, fs_info, qgroup_update->node,
2440                                         qgroup_update->extent_op);
2441                 kfree(qgroup_update);
2442         }
2443
2444         btrfs_put_tree_mod_seq(fs_info, &trans->delayed_ref_elem);
2445
2446         return ret;
2447 }
2448
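/*
 * return 1 once the delayed ref sequence counter has moved outside the
 * window [seq, seq + count), i.e. enough refs have been run since @seq was
 * sampled; 0 otherwise.
 */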
2449 static int refs_newer(struct btrfs_delayed_ref_root *delayed_refs, int seq,
2450                       int count)
2451 {
2452         int val = atomic_read(&delayed_refs->ref_seq);
2453
2454         if (val < seq || val >= seq + count)
2455                 return 1;
2456         return 0;
2457 }
2458
2459 /*
2460  * this starts processing the delayed reference count updates and
2461  * extent insertions we have queued up so far.  count can be
2462  * 0, which means to process everything in the tree at the start
2463  * of the run (but not newly added entries), or it can be some target
2464  * number you'd like to process.
2465  *
2466  * Returns 0 on success or if called with an aborted transaction
2467  * Returns <0 on error and aborts the transaction
2468  */
2469 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2470                            struct btrfs_root *root, unsigned long count)
2471 {
2472         struct rb_node *node;
2473         struct btrfs_delayed_ref_root *delayed_refs;
2474         struct btrfs_delayed_ref_node *ref;
2475         struct list_head cluster;
2476         int ret;
2477         u64 delayed_start;
2478         int run_all = count == (unsigned long)-1;
2479         int run_most = 0;
2480         int loops;
2481
2482         /* We'll clean this up in btrfs_cleanup_transaction */
2483         if (trans->aborted)
2484                 return 0;
2485
2486         if (root == root->fs_info->extent_root)
2487                 root = root->fs_info->tree_root;
2488
2489         btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info);
2490
2491         delayed_refs = &trans->transaction->delayed_refs;
2492         INIT_LIST_HEAD(&cluster);
2493         if (count == 0) {
2494                 count = delayed_refs->num_entries * 2;
2495                 run_most = 1;
2496         }
2497
2498         if (!run_all && !run_most) {
2499                 int old;
2500                 int seq = atomic_read(&delayed_refs->ref_seq);
2501
2502 progress:
2503                 old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1);
2504                 if (old) {
2505                         DEFINE_WAIT(__wait);
2506                         if (delayed_refs->num_entries < 16348)
2507                                 return 0;
2508
2509                         prepare_to_wait(&delayed_refs->wait, &__wait,
2510                                         TASK_UNINTERRUPTIBLE);
2511
2512                         old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1);
2513                         if (old) {
2514                                 schedule();
2515                                 finish_wait(&delayed_refs->wait, &__wait);
2516
2517                                 if (!refs_newer(delayed_refs, seq, 256))
2518                                         goto progress;
2519                                 else
2520                                         return 0;
2521                         } else {
2522                                 finish_wait(&delayed_refs->wait, &__wait);
2523                                 goto again;
2524                         }
2525                 }
2526
2527         } else {
2528                 atomic_inc(&delayed_refs->procs_running_refs);
2529         }
2530
2531 again:
2532         loops = 0;
2533         spin_lock(&delayed_refs->lock);
2534
2535 #ifdef SCRAMBLE_DELAYED_REFS
2536         delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
2537 #endif
2538
2539         while (1) {
2540                 if (!(run_all || run_most) &&
2541                     delayed_refs->num_heads_ready < 64)
2542                         break;
2543
2544                 /*
2545                  * go find something we can process in the rbtree.  We start at
2546                  * the beginning of the tree, and then build a cluster
2547                  * of refs to process starting at the first one we are able to
2548                  * lock
2549                  */
2550                 delayed_start = delayed_refs->run_delayed_start;
2551                 ret = btrfs_find_ref_cluster(trans, &cluster,
2552                                              delayed_refs->run_delayed_start);
2553                 if (ret)
2554                         break;
2555
2556                 ret = run_clustered_refs(trans, root, &cluster);
2557                 if (ret < 0) {
2558                         btrfs_release_ref_cluster(&cluster);
2559                         spin_unlock(&delayed_refs->lock);
2560                         btrfs_abort_transaction(trans, root, ret);
2561                         atomic_dec(&delayed_refs->procs_running_refs);
2562                         return ret;
2563                 }
2564
2565                 atomic_add(ret, &delayed_refs->ref_seq);
2566
2567                 count -= min_t(unsigned long, ret, count);
2568
2569                 if (count == 0)
2570                         break;
2571
2572                 if (delayed_start >= delayed_refs->run_delayed_start) {
2573                         if (loops == 0) {
2574                                 /*
2575                                  * btrfs_find_ref_cluster looped. let's do one
2576                                  * more cycle. if we don't run any delayed ref
2577                                  * during that cycle (because we can't because
2578                                  * all of them are blocked), bail out.
2579                                  */
2580                                 loops = 1;
2581                         } else {
2582                                 /*
2583                                  * no runnable refs left, stop trying
2584                                  */
2585                                 BUG_ON(run_all);
2586                                 break;
2587                         }
2588                 }
2589                 if (ret) {
2590                         /* refs were run, let's reset staleness detection */
2591                         loops = 0;
2592                 }
2593         }
2594
2595         if (run_all) {
2596                 if (!list_empty(&trans->new_bgs)) {
2597                         spin_unlock(&delayed_refs->lock);
2598                         btrfs_create_pending_block_groups(trans, root);
2599                         spin_lock(&delayed_refs->lock);
2600                 }
2601
2602                 node = rb_first(&delayed_refs->root);
2603                 if (!node)
2604                         goto out;
2605                 count = (unsigned long)-1;
2606
2607                 while (node) {
2608                         ref = rb_entry(node, struct btrfs_delayed_ref_node,
2609                                        rb_node);
2610                         if (btrfs_delayed_ref_is_head(ref)) {
2611                                 struct btrfs_delayed_ref_head *head;
2612
2613                                 head = btrfs_delayed_node_to_head(ref);
2614                                 atomic_inc(&ref->refs);
2615
2616                                 spin_unlock(&delayed_refs->lock);
2617                                 /*
2618                                  * Mutex was contended, block until it's
2619                                  * released and try again
2620                                  */
2621                                 mutex_lock(&head->mutex);
2622                                 mutex_unlock(&head->mutex);
2623
2624                                 btrfs_put_delayed_ref(ref);
2625                                 cond_resched();
2626                                 goto again;
2627                         }
2628                         node = rb_next(node);
2629                 }
2630                 spin_unlock(&delayed_refs->lock);
2631                 schedule_timeout(1);
2632                 goto again;
2633         }
2634 out:
2635         atomic_dec(&delayed_refs->procs_running_refs);
2636         smp_mb();
2637         if (waitqueue_active(&delayed_refs->wait))
2638                 wake_up(&delayed_refs->wait);
2639
2640         spin_unlock(&delayed_refs->lock);
2641         assert_qgroups_uptodate(trans);
2642         return 0;
2643 }
2644
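/*
 * queue a delayed extent op that ORs @flags into the flags of the extent
 * item covering [bytenr, bytenr + num_bytes).
 */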
2645 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
2646                                 struct btrfs_root *root,
2647                                 u64 bytenr, u64 num_bytes, u64 flags,
2648                                 int is_data)
2649 {
2650         struct btrfs_delayed_extent_op *extent_op;
2651         int ret;
2652
2653         extent_op = btrfs_alloc_delayed_extent_op();
2654         if (!extent_op)
2655                 return -ENOMEM;
2656
2657         extent_op->flags_to_set = flags;
2658         extent_op->update_flags = 1;
2659         extent_op->update_key = 0;
2660         extent_op->is_data = is_data ? 1 : 0;
2661
2662         ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr,
2663                                           num_bytes, extent_op);
2664         if (ret)
2665                 btrfs_free_delayed_extent_op(extent_op);
2666         return ret;
2667 }
2668
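/*
 * check the delayed ref tree for references to this extent from anyone
 * other than the given root/objectid/offset.  Returns 0 if our own data
 * ref is the only delayed ref, 1 if the extent is cross-referenced,
 * -ENOENT if there are no delayed refs for it and -EAGAIN if the head
 * mutex was contended and the caller should retry.
 */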
2669 static noinline int check_delayed_ref(struct btrfs_trans_handle *trans,
2670                                       struct btrfs_root *root,
2671                                       struct btrfs_path *path,
2672                                       u64 objectid, u64 offset, u64 bytenr)
2673 {
2674         struct btrfs_delayed_ref_head *head;
2675         struct btrfs_delayed_ref_node *ref;
2676         struct btrfs_delayed_data_ref *data_ref;
2677         struct btrfs_delayed_ref_root *delayed_refs;
2678         struct rb_node *node;
2679         int ret = 0;
2680
2681         ret = -ENOENT;
2682         delayed_refs = &trans->transaction->delayed_refs;
2683         spin_lock(&delayed_refs->lock);
2684         head = btrfs_find_delayed_ref_head(trans, bytenr);
2685         if (!head)
2686                 goto out;
2687
2688         if (!mutex_trylock(&head->mutex)) {
2689                 atomic_inc(&head->node.refs);
2690                 spin_unlock(&delayed_refs->lock);
2691
2692                 btrfs_release_path(path);
2693
2694                 /*
2695                  * Mutex was contended, block until it's released and let
2696                  * caller try again
2697                  */
2698                 mutex_lock(&head->mutex);
2699                 mutex_unlock(&head->mutex);
2700                 btrfs_put_delayed_ref(&head->node);
2701                 return -EAGAIN;
2702         }
2703
2704         node = rb_prev(&head->node.rb_node);
2705         if (!node)
2706                 goto out_unlock;
2707
2708         ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
2709
2710         if (ref->bytenr != bytenr)
2711                 goto out_unlock;
2712
2713         ret = 1;
2714         if (ref->type != BTRFS_EXTENT_DATA_REF_KEY)
2715                 goto out_unlock;
2716
2717         data_ref = btrfs_delayed_node_to_data_ref(ref);
2718
2719         node = rb_prev(node);
2720         if (node) {
2721                 int seq = ref->seq;
2722
2723                 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
2724                 if (ref->bytenr == bytenr && ref->seq == seq)
2725                         goto out_unlock;
2726         }
2727
2728         if (data_ref->root != root->root_key.objectid ||
2729             data_ref->objectid != objectid || data_ref->offset != offset)
2730                 goto out_unlock;
2731
2732         ret = 0;
2733 out_unlock:
2734         mutex_unlock(&head->mutex);
2735 out:
2736         spin_unlock(&delayed_refs->lock);
2737         return ret;
2738 }
2739
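/*
 * Look for references to the data extent at @bytenr in the committed
 * extent tree.
 *
 * Returns 0 only when the extent item carries a single inline
 * EXTENT_DATA_REF that belongs to this root, inode @objectid and file
 * @offset, and the extent was allocated after the root's last snapshot.
 * Returns 1 when the extent is (or may be) shared, and -ENOENT when no
 * matching extent item is found.
 */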
2740 static noinline int check_committed_ref(struct btrfs_trans_handle *trans,
2741                                         struct btrfs_root *root,
2742                                         struct btrfs_path *path,
2743                                         u64 objectid, u64 offset, u64 bytenr)
2744 {
2745         struct btrfs_root *extent_root = root->fs_info->extent_root;
2746         struct extent_buffer *leaf;
2747         struct btrfs_extent_data_ref *ref;
2748         struct btrfs_extent_inline_ref *iref;
2749         struct btrfs_extent_item *ei;
2750         struct btrfs_key key;
2751         u32 item_size;
2752         int ret;
2753
2754         key.objectid = bytenr;
2755         key.offset = (u64)-1;
2756         key.type = BTRFS_EXTENT_ITEM_KEY;
2757
2758         ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
2759         if (ret < 0)
2760                 goto out;
2761         BUG_ON(ret == 0); /* Corruption */
2762
2763         ret = -ENOENT;
2764         if (path->slots[0] == 0)
2765                 goto out;
2766
2767         path->slots[0]--;
2768         leaf = path->nodes[0];
2769         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2770
2771         if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
2772                 goto out;
2773
2774         ret = 1;
2775         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2776 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
2777         if (item_size < sizeof(*ei)) {
2778                 WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0));
2779                 goto out;
2780         }
2781 #endif
2782         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2783
2784         if (item_size != sizeof(*ei) +
2785             btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
2786                 goto out;
2787
2788         if (btrfs_extent_generation(leaf, ei) <=
2789             btrfs_root_last_snapshot(&root->root_item))
2790                 goto out;
2791
2792         iref = (struct btrfs_extent_inline_ref *)(ei + 1);
2793         if (btrfs_extent_inline_ref_type(leaf, iref) !=
2794             BTRFS_EXTENT_DATA_REF_KEY)
2795                 goto out;
2796
2797         ref = (struct btrfs_extent_data_ref *)(&iref->offset);
2798         if (btrfs_extent_refs(leaf, ei) !=
2799             btrfs_extent_data_ref_count(leaf, ref) ||
2800             btrfs_extent_data_ref_root(leaf, ref) !=
2801             root->root_key.objectid ||
2802             btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
2803             btrfs_extent_data_ref_offset(leaf, ref) != offset)
2804                 goto out;
2805
2806         ret = 0;
2807 out:
2808         return ret;
2809 }
2810
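/*
 * Decide whether the data extent at @bytenr is referenced by anything
 * other than this root/@objectid/@offset.  Both the committed extent tree
 * and the pending delayed refs are checked, and the delayed ref check is
 * retried for as long as it returns -EAGAIN.
 *
 * Returns 0 when no cross reference exists.  Any other return value means
 * the extent is, or may be, shared (or that an error occurred), so callers
 * such as the nocow write checks have to treat the extent as shared.
 */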
2811 int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
2812                           struct btrfs_root *root,
2813                           u64 objectid, u64 offset, u64 bytenr)
2814 {
2815         struct btrfs_path *path;
2816         int ret;
2817         int ret2;
2818
2819         path = btrfs_alloc_path();
2820         if (!path)
2821                 return -ENOMEM;
2822
2823         do {
2824                 ret = check_committed_ref(trans, root, path, objectid,
2825                                           offset, bytenr);
2826                 if (ret && ret != -ENOENT)
2827                         goto out;
2828
2829                 ret2 = check_delayed_ref(trans, root, path, objectid,
2830                                          offset, bytenr);
2831         } while (ret2 == -EAGAIN);
2832
2833         if (ret2 && ret2 != -ENOENT) {
2834                 ret = ret2;
2835                 goto out;
2836         }
2837
2838         if (ret != -ENOENT || ret2 != -ENOENT)
2839                 ret = 0;
2840 out:
2841         btrfs_free_path(path);
2842         if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
2843                 WARN_ON(ret > 0);
2844         return ret;
2845 }
2846
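/*
 * Add or drop one reference for every extent pointed to by @buf.  For a
 * leaf this means every regular or prealloc file extent (inline extents
 * and holes are skipped); for a node it means every child tree block.
 * When @full_backref is set the refs are recorded against buf->start as
 * the parent, otherwise they are recorded against the owning root.
 */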
2847 static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
2848                            struct btrfs_root *root,
2849                            struct extent_buffer *buf,
2850                            int full_backref, int inc, int for_cow)
2851 {
2852         u64 bytenr;
2853         u64 num_bytes;
2854         u64 parent;
2855         u64 ref_root;
2856         u32 nritems;
2857         struct btrfs_key key;
2858         struct btrfs_file_extent_item *fi;
2859         int i;
2860         int level;
2861         int ret = 0;
2862         int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
2863                             u64, u64, u64, u64, u64, u64, int);
2864
2865         ref_root = btrfs_header_owner(buf);
2866         nritems = btrfs_header_nritems(buf);
2867         level = btrfs_header_level(buf);
2868
2869         if (!root->ref_cows && level == 0)
2870                 return 0;
2871
2872         if (inc)
2873                 process_func = btrfs_inc_extent_ref;
2874         else
2875                 process_func = btrfs_free_extent;
2876
2877         if (full_backref)
2878                 parent = buf->start;
2879         else
2880                 parent = 0;
2881
2882         for (i = 0; i < nritems; i++) {
2883                 if (level == 0) {
2884                         btrfs_item_key_to_cpu(buf, &key, i);
2885                         if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
2886                                 continue;
2887                         fi = btrfs_item_ptr(buf, i,
2888                                             struct btrfs_file_extent_item);
2889                         if (btrfs_file_extent_type(buf, fi) ==
2890                             BTRFS_FILE_EXTENT_INLINE)
2891                                 continue;
2892                         bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
2893                         if (bytenr == 0)
2894                                 continue;
2895
2896                         num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
2897                         key.offset -= btrfs_file_extent_offset(buf, fi);
2898                         ret = process_func(trans, root, bytenr, num_bytes,
2899                                            parent, ref_root, key.objectid,
2900                                            key.offset, for_cow);
2901                         if (ret)
2902                                 goto fail;
2903                 } else {
2904                         bytenr = btrfs_node_blockptr(buf, i);
2905                         num_bytes = btrfs_level_size(root, level - 1);
2906                         ret = process_func(trans, root, bytenr, num_bytes,
2907                                            parent, ref_root, level - 1, 0,
2908                                            for_cow);
2909                         if (ret)
2910                                 goto fail;
2911                 }
2912         }
2913         return 0;
2914 fail:
2915         return ret;
2916 }
2917
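/*
 * Thin wrappers around __btrfs_mod_ref(): btrfs_inc_ref() adds one ref
 * for every extent referenced by @buf, btrfs_dec_ref() drops one.
 */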
2918 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2919                   struct extent_buffer *buf, int full_backref, int for_cow)
2920 {
2921         return __btrfs_mod_ref(trans, root, buf, full_backref, 1, for_cow);
2922 }
2923
2924 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2925                   struct extent_buffer *buf, int full_backref, int for_cow)
2926 {
2927         return __btrfs_mod_ref(trans, root, buf, full_backref, 0, for_cow);
2928 }
2929
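/*
 * Copy the in-memory block group item for @cache over the matching item
 * in the extent tree and mark the leaf dirty.  The item must already
 * exist; a search failure aborts the transaction.
 */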
2930 static int write_one_cache_group(struct btrfs_trans_handle *trans,
2931                                  struct btrfs_root *root,
2932                                  struct btrfs_path *path,
2933                                  struct btrfs_block_group_cache *cache)
2934 {
2935         int ret;
2936         struct btrfs_root *extent_root = root->fs_info->extent_root;
2937         unsigned long bi;
2938         struct extent_buffer *leaf;
2939
2940         ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
2941         if (ret < 0)
2942                 goto fail;
2943         BUG_ON(ret); /* Corruption */
2944
2945         leaf = path->nodes[0];
2946         bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
2947         write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
2948         btrfs_mark_buffer_dirty(leaf);
2949         btrfs_release_path(path);
2950 fail:
2951         if (ret) {
2952                 btrfs_abort_transaction(trans, root, ret);
2953                 return ret;
2954         }
2955         return 0;
2956
2957 }
2958
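/*
 * Return the block group that follows @cache in the per-fs block group
 * rbtree, taking a reference on it and dropping the caller's reference
 * on @cache.  Returns NULL after the last block group.
 */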
2959 static struct btrfs_block_group_cache *
2960 next_block_group(struct btrfs_root *root,
2961                  struct btrfs_block_group_cache *cache)
2962 {
2963         struct rb_node *node;
2964         spin_lock(&root->fs_info->block_group_cache_lock);
2965         node = rb_next(&cache->cache_node);
2966         btrfs_put_block_group(cache);
2967         if (node) {
2968                 cache = rb_entry(node, struct btrfs_block_group_cache,
2969                                  cache_node);
2970                 btrfs_get_block_group(cache);
2971         } else
2972                 cache = NULL;
2973         spin_unlock(&root->fs_info->block_group_cache_lock);
2974         return cache;
2975 }
2976
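/*
 * Set up the free space cache file for @block_group in this transaction:
 * look up (or create) the cache inode, truncate any stale contents and
 * preallocate room for the new cache.  On return disk_cache_state is
 * BTRFS_DC_SETUP on success, BTRFS_DC_WRITTEN when caching is skipped
 * (tiny block group, not yet cached, or nospace_cache) and BTRFS_DC_ERROR
 * when setup failed.
 */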
2977 static int cache_save_setup(struct btrfs_block_group_cache *block_group,
2978                             struct btrfs_trans_handle *trans,
2979                             struct btrfs_path *path)
2980 {
2981         struct btrfs_root *root = block_group->fs_info->tree_root;
2982         struct inode *inode = NULL;
2983         u64 alloc_hint = 0;
2984         int dcs = BTRFS_DC_ERROR;
2985         int num_pages = 0;
2986         int retries = 0;
2987         int ret = 0;
2988
2989         /*
2990          * If this block group is smaller than 100MB, don't bother
2991          * caching it.
2992          */
2993         if (block_group->key.offset < (100 * 1024 * 1024)) {
2994                 spin_lock(&block_group->lock);
2995                 block_group->disk_cache_state = BTRFS_DC_WRITTEN;
2996                 spin_unlock(&block_group->lock);
2997                 return 0;
2998         }
2999
3000 again:
3001         inode = lookup_free_space_inode(root, block_group, path);
3002         if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
3003                 ret = PTR_ERR(inode);
3004                 btrfs_release_path(path);
3005                 goto out;
3006         }
3007
3008         if (IS_ERR(inode)) {
3009                 BUG_ON(retries);
3010                 retries++;
3011
3012                 if (block_group->ro)
3013                         goto out_free;
3014
3015                 ret = create_free_space_inode(root, trans, block_group, path);
3016                 if (ret)
3017                         goto out_free;
3018                 goto again;
3019         }
3020
3021         /* We've already set up this transaction, go ahead and exit */
3022         if (block_group->cache_generation == trans->transid &&
3023             i_size_read(inode)) {
3024                 dcs = BTRFS_DC_SETUP;
3025                 goto out_put;
3026         }
3027
3028         /*
3029          * Set the generation to 0 so that, if anything goes wrong from
3030          * here on out, we know not to trust this cache the next time we
3031          * load it.
3032          */
3033         BTRFS_I(inode)->generation = 0;
3034         ret = btrfs_update_inode(trans, root, inode);
3035         WARN_ON(ret);
3036
3037         if (i_size_read(inode) > 0) {
3038                 ret = btrfs_truncate_free_space_cache(root, trans, path,
3039                                                       inode);
3040                 if (ret)
3041                         goto out_put;
3042         }
3043
3044         spin_lock(&block_group->lock);
3045         if (block_group->cached != BTRFS_CACHE_FINISHED ||
3046             !btrfs_test_opt(root, SPACE_CACHE)) {
3047                 /*
3048                  * don't bother trying to write stuff out _if_
3049                  * a) we're not cached,
3050                  * b) we're mounted with the nospace_cache option.
3051                  */
3052                 dcs = BTRFS_DC_WRITTEN;
3053                 spin_unlock(&block_group->lock);
3054                 goto out_put;
3055         }
3056         spin_unlock(&block_group->lock);
3057
3058         /*
3059          * Try to preallocate enough space based on how big the block group is.
3060          * Keep in mind this has to include any pinned space which could end up
3061          * taking up quite a bit since it's not folded into the other space
3062          * cache.
3063          */
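        /*
         * Worked example (illustrative only): a 1GB block group gives
         * div64_u64(1GB, 256MB) = 4, so 4 * 16 = 64 pages; with 4K pages
         * that is 256K preallocated for the cache file.
         */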
3064         num_pages = (int)div64_u64(block_group->key.offset, 256 * 1024 * 1024);
3065         if (!num_pages)
3066                 num_pages = 1;
3067
3068         num_pages *= 16;
3069         num_pages *= PAGE_CACHE_SIZE;
3070
3071         ret = btrfs_check_data_free_space(inode, num_pages);
3072         if (ret)
3073                 goto out_put;
3074
3075         ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
3076                                               num_pages, num_pages,
3077                                               &alloc_hint);
3078         if (!ret)
3079                 dcs = BTRFS_DC_SETUP;
3080         btrfs_free_reserved_data_space(inode, num_pages);
3081
3082 out_put:
3083         iput(inode);
3084 out_free:
3085         btrfs_release_path(path);
3086 out:
3087         spin_lock(&block_group->lock);
3088         if (!ret && dcs == BTRFS_DC_SETUP)
3089                 block_group->cache_generation = trans->transid;
3090         block_group->disk_cache_state = dcs;
3091         spin_unlock(&block_group->lock);
3092
3093         return ret;
3094 }
3095
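/*
 * Write out the block group items that changed during this transaction.
 * The first pass runs cache_save_setup() on every block group still in
 * BTRFS_DC_CLEAR state; the second pass runs the remaining delayed refs
 * and writes each dirty block group item, restarting from the top if new
 * DC_CLEAR groups show up in the meantime.
 */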
3096 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
3097                                    struct btrfs_root *root)
3098 {
3099         struct btrfs_block_group_cache *cache;
3100         int err = 0;
3101         struct btrfs_path *path;
3102         u64 last = 0;
3103
3104         path = btrfs_alloc_path();
3105         if (!path)
3106                 return -ENOMEM;
3107
3108 again:
3109         while (1) {
3110                 cache = btrfs_lookup_first_block_group(root->fs_info, last);
3111                 while (cache) {
3112                         if (cache->disk_cache_state == BTRFS_DC_CLEAR)
3113                                 break;
3114                         cache = next_block_group(root, cache);
3115                 }
3116                 if (!cache) {
3117                         if (last == 0)
3118                                 break;
3119                         last = 0;
3120                         continue;
3121                 }
3122                 err = cache_save_setup(cache, trans, path);
3123                 last = cache->key.objectid + cache->key.offset;
3124                 btrfs_put_block_group(cache);
3125         }
3126
3127         while (1) {
3128                 if (last == 0) {
3129                         err = btrfs_run_delayed_refs(trans, root,
3130                                                      (unsigned long)-1);
3131                         if (err) /* File system offline */
3132                                 goto out;
3133                 }
3134
3135                 cache = btrfs_lookup_first_block_group(root->fs_info, last);
3136                 while (cache) {
3137                         if (cache->disk_cache_state == BTRFS_DC_CLEAR) {
3138                                 btrfs_put_block_group(cache);
3139                                 goto again;
3140                         }
3141
3142                         if (cache->dirty)
3143                                 break;
3144                         cache = next_block_group(root, cache);
3145                 }
3146                 if (!cache) {
3147                         if (last == 0)
3148                                 break;
3149                         last = 0;
3150                         continue;
3151                 }
3152
3153                 if (cache->disk_cache_state == BTRFS_DC_SETUP)
3154                         cache->disk_cache_state = BTRFS_DC_NEED_WRITE;
3155                 cache->dirty = 0;
3156                 last = cache->key.objectid + cache->key.offset;
3157
3158                 err = write_one_cache_group(trans, root, path, cache);
3159                 if (err) /* File system offline */
3160                         goto out;