]> git.openfabrics.org - ~shefty/rdma-dev.git/blob - fs/btrfs/free-space-cache.c
Merge git://git.kernel.org/pub/scm/linux/kernel/git/hch/xfs-icache-races
[~shefty/rdma-dev.git] / fs / btrfs / free-space-cache.c
1 /*
2  * Copyright (C) 2008 Red Hat.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 021110-1307, USA.
17  */
18
19 #include <linux/pagemap.h>
20 #include <linux/sched.h>
21 #include <linux/math64.h>
22 #include "ctree.h"
23 #include "free-space-cache.h"
24 #include "transaction.h"
25
26 #define BITS_PER_BITMAP         (PAGE_CACHE_SIZE * 8)
27 #define MAX_CACHE_BYTES_PER_GIG (32 * 1024)
28
29 static inline unsigned long offset_to_bit(u64 bitmap_start, u64 sectorsize,
30                                           u64 offset)
31 {
32         BUG_ON(offset < bitmap_start);
33         offset -= bitmap_start;
34         return (unsigned long)(div64_u64(offset, sectorsize));
35 }
36
37 static inline unsigned long bytes_to_bits(u64 bytes, u64 sectorsize)
38 {
39         return (unsigned long)(div64_u64(bytes, sectorsize));
40 }
41
42 static inline u64 offset_to_bitmap(struct btrfs_block_group_cache *block_group,
43                                    u64 offset)
44 {
45         u64 bitmap_start;
46         u64 bytes_per_bitmap;
47
48         bytes_per_bitmap = BITS_PER_BITMAP * block_group->sectorsize;
49         bitmap_start = offset - block_group->key.objectid;
50         bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap);
51         bitmap_start *= bytes_per_bitmap;
52         bitmap_start += block_group->key.objectid;
53
54         return bitmap_start;
55 }
56
57 static int tree_insert_offset(struct rb_root *root, u64 offset,
58                               struct rb_node *node, int bitmap)
59 {
60         struct rb_node **p = &root->rb_node;
61         struct rb_node *parent = NULL;
62         struct btrfs_free_space *info;
63
64         while (*p) {
65                 parent = *p;
66                 info = rb_entry(parent, struct btrfs_free_space, offset_index);
67
68                 if (offset < info->offset) {
69                         p = &(*p)->rb_left;
70                 } else if (offset > info->offset) {
71                         p = &(*p)->rb_right;
72                 } else {
73                         /*
74                          * we could have a bitmap entry and an extent entry
75                          * share the same offset.  If this is the case, we want
76                          * the extent entry to always be found first if we do a
77                          * linear search through the tree, since we want to have
78                          * the quickest allocation time, and allocating from an
79                          * extent is faster than allocating from a bitmap.  So
80                          * if we're inserting a bitmap and we find an entry at
81                          * this offset, we want to go right, or after this entry
82                          * logically.  If we are inserting an extent and we've
83                          * found a bitmap, we want to go left, or before
84                          * logically.
85                          */
86                         if (bitmap) {
87                                 WARN_ON(info->bitmap);
88                                 p = &(*p)->rb_right;
89                         } else {
90                                 WARN_ON(!info->bitmap);
91                                 p = &(*p)->rb_left;
92                         }
93                 }
94         }
95
96         rb_link_node(node, parent, p);
97         rb_insert_color(node, root);
98
99         return 0;
100 }
101
102 /*
103  * searches the tree for the given offset.
104  *
105  * fuzzy - If this is set, then we are trying to make an allocation, and we just
106  * want a section that has at least bytes size and comes at or after the given
107  * offset.
108  */
109 static struct btrfs_free_space *
110 tree_search_offset(struct btrfs_block_group_cache *block_group,
111                    u64 offset, int bitmap_only, int fuzzy)
112 {
113         struct rb_node *n = block_group->free_space_offset.rb_node;
114         struct btrfs_free_space *entry, *prev = NULL;
115
116         /* find entry that is closest to the 'offset' */
117         while (1) {
118                 if (!n) {
119                         entry = NULL;
120                         break;
121                 }
122
123                 entry = rb_entry(n, struct btrfs_free_space, offset_index);
124                 prev = entry;
125
126                 if (offset < entry->offset)
127                         n = n->rb_left;
128                 else if (offset > entry->offset)
129                         n = n->rb_right;
130                 else
131                         break;
132         }
133
134         if (bitmap_only) {
135                 if (!entry)
136                         return NULL;
137                 if (entry->bitmap)
138                         return entry;
139
140                 /*
141                  * bitmap entry and extent entry may share same offset,
142                  * in that case, bitmap entry comes after extent entry.
143                  */
144                 n = rb_next(n);
145                 if (!n)
146                         return NULL;
147                 entry = rb_entry(n, struct btrfs_free_space, offset_index);
148                 if (entry->offset != offset)
149                         return NULL;
150
151                 WARN_ON(!entry->bitmap);
152                 return entry;
153         } else if (entry) {
154                 if (entry->bitmap) {
155                         /*
156                          * if previous extent entry covers the offset,
157                          * we should return it instead of the bitmap entry
158                          */
159                         n = &entry->offset_index;
160                         while (1) {
161                                 n = rb_prev(n);
162                                 if (!n)
163                                         break;
164                                 prev = rb_entry(n, struct btrfs_free_space,
165                                                 offset_index);
166                                 if (!prev->bitmap) {
167                                         if (prev->offset + prev->bytes > offset)
168                                                 entry = prev;
169                                         break;
170                                 }
171                         }
172                 }
173                 return entry;
174         }
175
176         if (!prev)
177                 return NULL;
178
179         /* find last entry before the 'offset' */
180         entry = prev;
181         if (entry->offset > offset) {
182                 n = rb_prev(&entry->offset_index);
183                 if (n) {
184                         entry = rb_entry(n, struct btrfs_free_space,
185                                         offset_index);
186                         BUG_ON(entry->offset > offset);
187                 } else {
188                         if (fuzzy)
189                                 return entry;
190                         else
191                                 return NULL;
192                 }
193         }
194
195         if (entry->bitmap) {
196                 n = &entry->offset_index;
197                 while (1) {
198                         n = rb_prev(n);
199                         if (!n)
200                                 break;
201                         prev = rb_entry(n, struct btrfs_free_space,
202                                         offset_index);
203                         if (!prev->bitmap) {
204                                 if (prev->offset + prev->bytes > offset)
205                                         return prev;
206                                 break;
207                         }
208                 }
209                 if (entry->offset + BITS_PER_BITMAP *
210                     block_group->sectorsize > offset)
211                         return entry;
212         } else if (entry->offset + entry->bytes > offset)
213                 return entry;
214
215         if (!fuzzy)
216                 return NULL;
217
218         while (1) {
219                 if (entry->bitmap) {
220                         if (entry->offset + BITS_PER_BITMAP *
221                             block_group->sectorsize > offset)
222                                 break;
223                 } else {
224                         if (entry->offset + entry->bytes > offset)
225                                 break;
226                 }
227
228                 n = rb_next(&entry->offset_index);
229                 if (!n)
230                         return NULL;
231                 entry = rb_entry(n, struct btrfs_free_space, offset_index);
232         }
233         return entry;
234 }
235
236 static void unlink_free_space(struct btrfs_block_group_cache *block_group,
237                               struct btrfs_free_space *info)
238 {
239         rb_erase(&info->offset_index, &block_group->free_space_offset);
240         block_group->free_extents--;
241         block_group->free_space -= info->bytes;
242 }
243
244 static int link_free_space(struct btrfs_block_group_cache *block_group,
245                            struct btrfs_free_space *info)
246 {
247         int ret = 0;
248
249         BUG_ON(!info->bitmap && !info->bytes);
250         ret = tree_insert_offset(&block_group->free_space_offset, info->offset,
251                                  &info->offset_index, (info->bitmap != NULL));
252         if (ret)
253                 return ret;
254
255         block_group->free_space += info->bytes;
256         block_group->free_extents++;
257         return ret;
258 }
259
260 static void recalculate_thresholds(struct btrfs_block_group_cache *block_group)
261 {
262         u64 max_bytes, possible_bytes;
263
264         /*
265          * The goal is to keep the total amount of memory used per 1gb of space
266          * at or below 32k, so we need to adjust how much memory we allow to be
267          * used by extent based free space tracking
268          */
269         max_bytes = MAX_CACHE_BYTES_PER_GIG *
270                 (div64_u64(block_group->key.offset, 1024 * 1024 * 1024));
271
272         possible_bytes = (block_group->total_bitmaps * PAGE_CACHE_SIZE) +
273                 (sizeof(struct btrfs_free_space) *
274                  block_group->extents_thresh);
275
276         if (possible_bytes > max_bytes) {
277                 int extent_bytes = max_bytes -
278                         (block_group->total_bitmaps * PAGE_CACHE_SIZE);
279
280                 if (extent_bytes <= 0) {
281                         block_group->extents_thresh = 0;
282                         return;
283                 }
284
285                 block_group->extents_thresh = extent_bytes /
286                         (sizeof(struct btrfs_free_space));
287         }
288 }
289
290 static void bitmap_clear_bits(struct btrfs_block_group_cache *block_group,
291                               struct btrfs_free_space *info, u64 offset,
292                               u64 bytes)
293 {
294         unsigned long start, end;
295         unsigned long i;
296
297         start = offset_to_bit(info->offset, block_group->sectorsize, offset);
298         end = start + bytes_to_bits(bytes, block_group->sectorsize);
299         BUG_ON(end > BITS_PER_BITMAP);
300
301         for (i = start; i < end; i++)
302                 clear_bit(i, info->bitmap);
303
304         info->bytes -= bytes;
305         block_group->free_space -= bytes;
306 }
307
308 static void bitmap_set_bits(struct btrfs_block_group_cache *block_group,
309                             struct btrfs_free_space *info, u64 offset,
310                             u64 bytes)
311 {
312         unsigned long start, end;
313         unsigned long i;
314
315         start = offset_to_bit(info->offset, block_group->sectorsize, offset);
316         end = start + bytes_to_bits(bytes, block_group->sectorsize);
317         BUG_ON(end > BITS_PER_BITMAP);
318
319         for (i = start; i < end; i++)
320                 set_bit(i, info->bitmap);
321
322         info->bytes += bytes;
323         block_group->free_space += bytes;
324 }
325
326 static int search_bitmap(struct btrfs_block_group_cache *block_group,
327                          struct btrfs_free_space *bitmap_info, u64 *offset,
328                          u64 *bytes)
329 {
330         unsigned long found_bits = 0;
331         unsigned long bits, i;
332         unsigned long next_zero;
333
334         i = offset_to_bit(bitmap_info->offset, block_group->sectorsize,
335                           max_t(u64, *offset, bitmap_info->offset));
336         bits = bytes_to_bits(*bytes, block_group->sectorsize);
337
338         for (i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i);
339              i < BITS_PER_BITMAP;
340              i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i + 1)) {
341                 next_zero = find_next_zero_bit(bitmap_info->bitmap,
342                                                BITS_PER_BITMAP, i);
343                 if ((next_zero - i) >= bits) {
344                         found_bits = next_zero - i;
345                         break;
346                 }
347                 i = next_zero;
348         }
349
350         if (found_bits) {
351                 *offset = (u64)(i * block_group->sectorsize) +
352                         bitmap_info->offset;
353                 *bytes = (u64)(found_bits) * block_group->sectorsize;
354                 return 0;
355         }
356
357         return -1;
358 }
359
360 static struct btrfs_free_space *find_free_space(struct btrfs_block_group_cache
361                                                 *block_group, u64 *offset,
362                                                 u64 *bytes, int debug)
363 {
364         struct btrfs_free_space *entry;
365         struct rb_node *node;
366         int ret;
367
368         if (!block_group->free_space_offset.rb_node)
369                 return NULL;
370
371         entry = tree_search_offset(block_group,
372                                    offset_to_bitmap(block_group, *offset),
373                                    0, 1);
374         if (!entry)
375                 return NULL;
376
377         for (node = &entry->offset_index; node; node = rb_next(node)) {
378                 entry = rb_entry(node, struct btrfs_free_space, offset_index);
379                 if (entry->bytes < *bytes)
380                         continue;
381
382                 if (entry->bitmap) {
383                         ret = search_bitmap(block_group, entry, offset, bytes);
384                         if (!ret)
385                                 return entry;
386                         continue;
387                 }
388
389                 *offset = entry->offset;
390                 *bytes = entry->bytes;
391                 return entry;
392         }
393
394         return NULL;
395 }
396
397 static void add_new_bitmap(struct btrfs_block_group_cache *block_group,
398                            struct btrfs_free_space *info, u64 offset)
399 {
400         u64 bytes_per_bg = BITS_PER_BITMAP * block_group->sectorsize;
401         int max_bitmaps = (int)div64_u64(block_group->key.offset +
402                                          bytes_per_bg - 1, bytes_per_bg);
403         BUG_ON(block_group->total_bitmaps >= max_bitmaps);
404
405         info->offset = offset_to_bitmap(block_group, offset);
406         link_free_space(block_group, info);
407         block_group->total_bitmaps++;
408
409         recalculate_thresholds(block_group);
410 }
411
412 static noinline int remove_from_bitmap(struct btrfs_block_group_cache *block_group,
413                               struct btrfs_free_space *bitmap_info,
414                               u64 *offset, u64 *bytes)
415 {
416         u64 end;
417
418 again:
419         end = bitmap_info->offset +
420                 (u64)(BITS_PER_BITMAP * block_group->sectorsize) - 1;
421
422         if (*offset > bitmap_info->offset && *offset + *bytes > end) {
423                 bitmap_clear_bits(block_group, bitmap_info, *offset,
424                                   end - *offset + 1);
425                 *bytes -= end - *offset + 1;
426                 *offset = end + 1;
427         } else if (*offset >= bitmap_info->offset && *offset + *bytes <= end) {
428                 bitmap_clear_bits(block_group, bitmap_info, *offset, *bytes);
429                 *bytes = 0;
430         }
431
432         if (*bytes) {
433                 if (!bitmap_info->bytes) {
434                         unlink_free_space(block_group, bitmap_info);
435                         kfree(bitmap_info->bitmap);
436                         kfree(bitmap_info);
437                         block_group->total_bitmaps--;
438                         recalculate_thresholds(block_group);
439                 }
440
441                 bitmap_info = tree_search_offset(block_group,
442                                                  offset_to_bitmap(block_group,
443                                                                   *offset),
444                                                  1, 0);
445                 if (!bitmap_info)
446                         return -EINVAL;
447
448                 if (!bitmap_info->bitmap)
449                         return -EAGAIN;
450
451                 goto again;
452         } else if (!bitmap_info->bytes) {
453                 unlink_free_space(block_group, bitmap_info);
454                 kfree(bitmap_info->bitmap);
455                 kfree(bitmap_info);
456                 block_group->total_bitmaps--;
457                 recalculate_thresholds(block_group);
458         }
459
460         return 0;
461 }
462
463 static int insert_into_bitmap(struct btrfs_block_group_cache *block_group,
464                               struct btrfs_free_space *info)
465 {
466         struct btrfs_free_space *bitmap_info;
467         int added = 0;
468         u64 bytes, offset, end;
469         int ret;
470
471         /*
472          * If we are below the extents threshold then we can add this as an
473          * extent, and don't have to deal with the bitmap
474          */
475         if (block_group->free_extents < block_group->extents_thresh &&
476             info->bytes > block_group->sectorsize * 4)
477                 return 0;
478
479         /*
480          * some block groups are so tiny they can't be enveloped by a bitmap, so
481          * don't even bother to create a bitmap for this
482          */
483         if (BITS_PER_BITMAP * block_group->sectorsize >
484             block_group->key.offset)
485                 return 0;
486
487         bytes = info->bytes;
488         offset = info->offset;
489
490 again:
491         bitmap_info = tree_search_offset(block_group,
492                                          offset_to_bitmap(block_group, offset),
493                                          1, 0);
494         if (!bitmap_info) {
495                 BUG_ON(added);
496                 goto new_bitmap;
497         }
498
499         end = bitmap_info->offset +
500                 (u64)(BITS_PER_BITMAP * block_group->sectorsize);
501
502         if (offset >= bitmap_info->offset && offset + bytes > end) {
503                 bitmap_set_bits(block_group, bitmap_info, offset,
504                                 end - offset);
505                 bytes -= end - offset;
506                 offset = end;
507                 added = 0;
508         } else if (offset >= bitmap_info->offset && offset + bytes <= end) {
509                 bitmap_set_bits(block_group, bitmap_info, offset, bytes);
510                 bytes = 0;
511         } else {
512                 BUG();
513         }
514
515         if (!bytes) {
516                 ret = 1;
517                 goto out;
518         } else
519                 goto again;
520
521 new_bitmap:
522         if (info && info->bitmap) {
523                 add_new_bitmap(block_group, info, offset);
524                 added = 1;
525                 info = NULL;
526                 goto again;
527         } else {
528                 spin_unlock(&block_group->tree_lock);
529
530                 /* no pre-allocated info, allocate a new one */
531                 if (!info) {
532                         info = kzalloc(sizeof(struct btrfs_free_space),
533                                        GFP_NOFS);
534                         if (!info) {
535                                 spin_lock(&block_group->tree_lock);
536                                 ret = -ENOMEM;
537                                 goto out;
538                         }
539                 }
540
541                 /* allocate the bitmap */
542                 info->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
543                 spin_lock(&block_group->tree_lock);
544                 if (!info->bitmap) {
545                         ret = -ENOMEM;
546                         goto out;
547                 }
548                 goto again;
549         }
550
551 out:
552         if (info) {
553                 if (info->bitmap)
554                         kfree(info->bitmap);
555                 kfree(info);
556         }
557
558         return ret;
559 }
560
561 int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
562                          u64 offset, u64 bytes)
563 {
564         struct btrfs_free_space *right_info = NULL;
565         struct btrfs_free_space *left_info = NULL;
566         struct btrfs_free_space *info = NULL;
567         int ret = 0;
568
569         info = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS);
570         if (!info)
571                 return -ENOMEM;
572
573         info->offset = offset;
574         info->bytes = bytes;
575
576         spin_lock(&block_group->tree_lock);
577
578         /*
579          * first we want to see if there is free space adjacent to the range we
580          * are adding, if there is remove that struct and add a new one to
581          * cover the entire range
582          */
583         right_info = tree_search_offset(block_group, offset + bytes, 0, 0);
584         if (right_info && rb_prev(&right_info->offset_index))
585                 left_info = rb_entry(rb_prev(&right_info->offset_index),
586                                      struct btrfs_free_space, offset_index);
587         else
588                 left_info = tree_search_offset(block_group, offset - 1, 0, 0);
589
590         /*
591          * If there was no extent directly to the left or right of this new
592          * extent then we know we're going to have to allocate a new extent, so
593          * before we do that see if we need to drop this into a bitmap
594          */
595         if ((!left_info || left_info->bitmap) &&
596             (!right_info || right_info->bitmap)) {
597                 ret = insert_into_bitmap(block_group, info);
598
599                 if (ret < 0) {
600                         goto out;
601                 } else if (ret) {
602                         ret = 0;
603                         goto out;
604                 }
605         }
606
607         if (right_info && !right_info->bitmap) {
608                 unlink_free_space(block_group, right_info);
609                 info->bytes += right_info->bytes;
610                 kfree(right_info);
611         }
612
613         if (left_info && !left_info->bitmap &&
614             left_info->offset + left_info->bytes == offset) {
615                 unlink_free_space(block_group, left_info);
616                 info->offset = left_info->offset;
617                 info->bytes += left_info->bytes;
618                 kfree(left_info);
619         }
620
621         ret = link_free_space(block_group, info);
622         if (ret)
623                 kfree(info);
624 out:
625         spin_unlock(&block_group->tree_lock);
626
627         if (ret) {
628                 printk(KERN_CRIT "btrfs: unable to add free space :%d\n", ret);
629                 BUG_ON(ret == -EEXIST);
630         }
631
632         return ret;
633 }
634
635 int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
636                             u64 offset, u64 bytes)
637 {
638         struct btrfs_free_space *info;
639         struct btrfs_free_space *next_info = NULL;
640         int ret = 0;
641
642         spin_lock(&block_group->tree_lock);
643
644 again:
645         info = tree_search_offset(block_group, offset, 0, 0);
646         if (!info) {
647                 WARN_ON(1);
648                 goto out_lock;
649         }
650
651         if (info->bytes < bytes && rb_next(&info->offset_index)) {
652                 u64 end;
653                 next_info = rb_entry(rb_next(&info->offset_index),
654                                              struct btrfs_free_space,
655                                              offset_index);
656
657                 if (next_info->bitmap)
658                         end = next_info->offset + BITS_PER_BITMAP *
659                                 block_group->sectorsize - 1;
660                 else
661                         end = next_info->offset + next_info->bytes;
662
663                 if (next_info->bytes < bytes ||
664                     next_info->offset > offset || offset > end) {
665                         printk(KERN_CRIT "Found free space at %llu, size %llu,"
666                               " trying to use %llu\n",
667                               (unsigned long long)info->offset,
668                               (unsigned long long)info->bytes,
669                               (unsigned long long)bytes);
670                         WARN_ON(1);
671                         ret = -EINVAL;
672                         goto out_lock;
673                 }
674
675                 info = next_info;
676         }
677
678         if (info->bytes == bytes) {
679                 unlink_free_space(block_group, info);
680                 if (info->bitmap) {
681                         kfree(info->bitmap);
682                         block_group->total_bitmaps--;
683                 }
684                 kfree(info);
685                 goto out_lock;
686         }
687
688         if (!info->bitmap && info->offset == offset) {
689                 unlink_free_space(block_group, info);
690                 info->offset += bytes;
691                 info->bytes -= bytes;
692                 link_free_space(block_group, info);
693                 goto out_lock;
694         }
695
696         if (!info->bitmap && info->offset <= offset &&
697             info->offset + info->bytes >= offset + bytes) {
698                 u64 old_start = info->offset;
699                 /*
700                  * we're freeing space in the middle of the info,
701                  * this can happen during tree log replay
702                  *
703                  * first unlink the old info and then
704                  * insert it again after the hole we're creating
705                  */
706                 unlink_free_space(block_group, info);
707                 if (offset + bytes < info->offset + info->bytes) {
708                         u64 old_end = info->offset + info->bytes;
709
710                         info->offset = offset + bytes;
711                         info->bytes = old_end - info->offset;
712                         ret = link_free_space(block_group, info);
713                         WARN_ON(ret);
714                         if (ret)
715                                 goto out_lock;
716                 } else {
717                         /* the hole we're creating ends at the end
718                          * of the info struct, just free the info
719                          */
720                         kfree(info);
721                 }
722                 spin_unlock(&block_group->tree_lock);
723
724                 /* step two, insert a new info struct to cover
725                  * anything before the hole
726                  */
727                 ret = btrfs_add_free_space(block_group, old_start,
728                                            offset - old_start);
729                 WARN_ON(ret);
730                 goto out;
731         }
732
733         ret = remove_from_bitmap(block_group, info, &offset, &bytes);
734         if (ret == -EAGAIN)
735                 goto again;
736         BUG_ON(ret);
737 out_lock:
738         spin_unlock(&block_group->tree_lock);
739 out:
740         return ret;
741 }
742
743 void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
744                            u64 bytes)
745 {
746         struct btrfs_free_space *info;
747         struct rb_node *n;
748         int count = 0;
749
750         for (n = rb_first(&block_group->free_space_offset); n; n = rb_next(n)) {
751                 info = rb_entry(n, struct btrfs_free_space, offset_index);
752                 if (info->bytes >= bytes)
753                         count++;
754                 printk(KERN_CRIT "entry offset %llu, bytes %llu, bitmap %s\n",
755                        (unsigned long long)info->offset,
756                        (unsigned long long)info->bytes,
757                        (info->bitmap) ? "yes" : "no");
758         }
759         printk(KERN_INFO "block group has cluster?: %s\n",
760                list_empty(&block_group->cluster_list) ? "no" : "yes");
761         printk(KERN_INFO "%d blocks of free space at or bigger than bytes is"
762                "\n", count);
763 }
764
765 u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group)
766 {
767         struct btrfs_free_space *info;
768         struct rb_node *n;
769         u64 ret = 0;
770
771         for (n = rb_first(&block_group->free_space_offset); n;
772              n = rb_next(n)) {
773                 info = rb_entry(n, struct btrfs_free_space, offset_index);
774                 ret += info->bytes;
775         }
776
777         return ret;
778 }
779
780 /*
781  * for a given cluster, put all of its extents back into the free
782  * space cache.  If the block group passed doesn't match the block group
783  * pointed to by the cluster, someone else raced in and freed the
784  * cluster already.  In that case, we just return without changing anything
785  */
786 static int
787 __btrfs_return_cluster_to_free_space(
788                              struct btrfs_block_group_cache *block_group,
789                              struct btrfs_free_cluster *cluster)
790 {
791         struct btrfs_free_space *entry;
792         struct rb_node *node;
793         bool bitmap;
794
795         spin_lock(&cluster->lock);
796         if (cluster->block_group != block_group)
797                 goto out;
798
799         bitmap = cluster->points_to_bitmap;
800         cluster->block_group = NULL;
801         cluster->window_start = 0;
802         list_del_init(&cluster->block_group_list);
803         cluster->points_to_bitmap = false;
804
805         if (bitmap)
806                 goto out;
807
808         node = rb_first(&cluster->root);
809         while (node) {
810                 entry = rb_entry(node, struct btrfs_free_space, offset_index);
811                 node = rb_next(&entry->offset_index);
812                 rb_erase(&entry->offset_index, &cluster->root);
813                 BUG_ON(entry->bitmap);
814                 tree_insert_offset(&block_group->free_space_offset,
815                                    entry->offset, &entry->offset_index, 0);
816         }
817         cluster->root.rb_node = NULL;
818
819 out:
820         spin_unlock(&cluster->lock);
821         btrfs_put_block_group(block_group);
822         return 0;
823 }
824
825 void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
826 {
827         struct btrfs_free_space *info;
828         struct rb_node *node;
829         struct btrfs_free_cluster *cluster;
830         struct list_head *head;
831
832         spin_lock(&block_group->tree_lock);
833         while ((head = block_group->cluster_list.next) !=
834                &block_group->cluster_list) {
835                 cluster = list_entry(head, struct btrfs_free_cluster,
836                                      block_group_list);
837
838                 WARN_ON(cluster->block_group != block_group);
839                 __btrfs_return_cluster_to_free_space(block_group, cluster);
840                 if (need_resched()) {
841                         spin_unlock(&block_group->tree_lock);
842                         cond_resched();
843                         spin_lock(&block_group->tree_lock);
844                 }
845         }
846
847         while ((node = rb_last(&block_group->free_space_offset)) != NULL) {
848                 info = rb_entry(node, struct btrfs_free_space, offset_index);
849                 unlink_free_space(block_group, info);
850                 if (info->bitmap)
851                         kfree(info->bitmap);
852                 kfree(info);
853                 if (need_resched()) {
854                         spin_unlock(&block_group->tree_lock);
855                         cond_resched();
856                         spin_lock(&block_group->tree_lock);
857                 }
858         }
859
860         spin_unlock(&block_group->tree_lock);
861 }
862
863 u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
864                                u64 offset, u64 bytes, u64 empty_size)
865 {
866         struct btrfs_free_space *entry = NULL;
867         u64 bytes_search = bytes + empty_size;
868         u64 ret = 0;
869
870         spin_lock(&block_group->tree_lock);
871         entry = find_free_space(block_group, &offset, &bytes_search, 0);
872         if (!entry)
873                 goto out;
874
875         ret = offset;
876         if (entry->bitmap) {
877                 bitmap_clear_bits(block_group, entry, offset, bytes);
878                 if (!entry->bytes) {
879                         unlink_free_space(block_group, entry);
880                         kfree(entry->bitmap);
881                         kfree(entry);
882                         block_group->total_bitmaps--;
883                         recalculate_thresholds(block_group);
884                 }
885         } else {
886                 unlink_free_space(block_group, entry);
887                 entry->offset += bytes;
888                 entry->bytes -= bytes;
889                 if (!entry->bytes)
890                         kfree(entry);
891                 else
892                         link_free_space(block_group, entry);
893         }
894
895 out:
896         spin_unlock(&block_group->tree_lock);
897
898         return ret;
899 }
900
901 /*
902  * given a cluster, put all of its extents back into the free space
903  * cache.  If a block group is passed, this function will only free
904  * a cluster that belongs to the passed block group.
905  *
906  * Otherwise, it'll get a reference on the block group pointed to by the
907  * cluster and remove the cluster from it.
908  */
909 int btrfs_return_cluster_to_free_space(
910                                struct btrfs_block_group_cache *block_group,
911                                struct btrfs_free_cluster *cluster)
912 {
913         int ret;
914
915         /* first, get a safe pointer to the block group */
916         spin_lock(&cluster->lock);
917         if (!block_group) {
918                 block_group = cluster->block_group;
919                 if (!block_group) {
920                         spin_unlock(&cluster->lock);
921                         return 0;
922                 }
923         } else if (cluster->block_group != block_group) {
924                 /* someone else has already freed it don't redo their work */
925                 spin_unlock(&cluster->lock);
926                 return 0;
927         }
928         atomic_inc(&block_group->count);
929         spin_unlock(&cluster->lock);
930
931         /* now return any extents the cluster had on it */
932         spin_lock(&block_group->tree_lock);
933         ret = __btrfs_return_cluster_to_free_space(block_group, cluster);
934         spin_unlock(&block_group->tree_lock);
935
936         /* finally drop our ref */
937         btrfs_put_block_group(block_group);
938         return ret;
939 }
940
941 static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group,
942                                    struct btrfs_free_cluster *cluster,
943                                    u64 bytes, u64 min_start)
944 {
945         struct btrfs_free_space *entry;
946         int err;
947         u64 search_start = cluster->window_start;
948         u64 search_bytes = bytes;
949         u64 ret = 0;
950
951         spin_lock(&block_group->tree_lock);
952         spin_lock(&cluster->lock);
953
954         if (!cluster->points_to_bitmap)
955                 goto out;
956
957         if (cluster->block_group != block_group)
958                 goto out;
959
960         entry = tree_search_offset(block_group, search_start, 0, 0);
961
962         if (!entry || !entry->bitmap)
963                 goto out;
964
965         search_start = min_start;
966         search_bytes = bytes;
967
968         err = search_bitmap(block_group, entry, &search_start,
969                             &search_bytes);
970         if (err)
971                 goto out;
972
973         ret = search_start;
974         bitmap_clear_bits(block_group, entry, ret, bytes);
975 out:
976         spin_unlock(&cluster->lock);
977         spin_unlock(&block_group->tree_lock);
978
979         return ret;
980 }
981
982 /*
983  * given a cluster, try to allocate 'bytes' from it, returns 0
984  * if it couldn't find anything suitably large, or a logical disk offset
985  * if things worked out
986  */
987 u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
988                              struct btrfs_free_cluster *cluster, u64 bytes,
989                              u64 min_start)
990 {
991         struct btrfs_free_space *entry = NULL;
992         struct rb_node *node;
993         u64 ret = 0;
994
995         if (cluster->points_to_bitmap)
996                 return btrfs_alloc_from_bitmap(block_group, cluster, bytes,
997                                                min_start);
998
999         spin_lock(&cluster->lock);
1000         if (bytes > cluster->max_size)
1001                 goto out;
1002
1003         if (cluster->block_group != block_group)
1004                 goto out;
1005
1006         node = rb_first(&cluster->root);
1007         if (!node)
1008                 goto out;
1009
1010         entry = rb_entry(node, struct btrfs_free_space, offset_index);
1011
1012         while(1) {
1013                 if (entry->bytes < bytes || entry->offset < min_start) {
1014                         struct rb_node *node;
1015
1016                         node = rb_next(&entry->offset_index);
1017                         if (!node)
1018                                 break;
1019                         entry = rb_entry(node, struct btrfs_free_space,
1020                                          offset_index);
1021                         continue;
1022                 }
1023                 ret = entry->offset;
1024
1025                 entry->offset += bytes;
1026                 entry->bytes -= bytes;
1027
1028                 if (entry->bytes == 0) {
1029                         rb_erase(&entry->offset_index, &cluster->root);
1030                         kfree(entry);
1031                 }
1032                 break;
1033         }
1034 out:
1035         spin_unlock(&cluster->lock);
1036
1037         return ret;
1038 }
1039
1040 static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
1041                                 struct btrfs_free_space *entry,
1042                                 struct btrfs_free_cluster *cluster,
1043                                 u64 offset, u64 bytes, u64 min_bytes)
1044 {
1045         unsigned long next_zero;
1046         unsigned long i;
1047         unsigned long search_bits;
1048         unsigned long total_bits;
1049         unsigned long found_bits;
1050         unsigned long start = 0;
1051         unsigned long total_found = 0;
1052         bool found = false;
1053
1054         i = offset_to_bit(entry->offset, block_group->sectorsize,
1055                           max_t(u64, offset, entry->offset));
1056         search_bits = bytes_to_bits(min_bytes, block_group->sectorsize);
1057         total_bits = bytes_to_bits(bytes, block_group->sectorsize);
1058
1059 again:
1060         found_bits = 0;
1061         for (i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i);
1062              i < BITS_PER_BITMAP;
1063              i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i + 1)) {
1064                 next_zero = find_next_zero_bit(entry->bitmap,
1065                                                BITS_PER_BITMAP, i);
1066                 if (next_zero - i >= search_bits) {
1067                         found_bits = next_zero - i;
1068                         break;
1069                 }
1070                 i = next_zero;
1071         }
1072
1073         if (!found_bits)
1074                 return -1;
1075
1076         if (!found) {
1077                 start = i;
1078                 found = true;
1079         }
1080
1081         total_found += found_bits;
1082
1083         if (cluster->max_size < found_bits * block_group->sectorsize)
1084                 cluster->max_size = found_bits * block_group->sectorsize;
1085
1086         if (total_found < total_bits) {
1087                 i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, next_zero);
1088                 if (i - start > total_bits * 2) {
1089                         total_found = 0;
1090                         cluster->max_size = 0;
1091                         found = false;
1092                 }
1093                 goto again;
1094         }
1095
1096         cluster->window_start = start * block_group->sectorsize +
1097                 entry->offset;
1098         cluster->points_to_bitmap = true;
1099
1100         return 0;
1101 }
1102
1103 /*
1104  * here we try to find a cluster of blocks in a block group.  The goal
1105  * is to find at least bytes free and up to empty_size + bytes free.
1106  * We might not find them all in one contiguous area.
1107  *
1108  * returns zero and sets up cluster if things worked out, otherwise
1109  * it returns -enospc
1110  */
1111 int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
1112                              struct btrfs_root *root,
1113                              struct btrfs_block_group_cache *block_group,
1114                              struct btrfs_free_cluster *cluster,
1115                              u64 offset, u64 bytes, u64 empty_size)
1116 {
1117         struct btrfs_free_space *entry = NULL;
1118         struct rb_node *node;
1119         struct btrfs_free_space *next;
1120         struct btrfs_free_space *last = NULL;
1121         u64 min_bytes;
1122         u64 window_start;
1123         u64 window_free;
1124         u64 max_extent = 0;
1125         bool found_bitmap = false;
1126         int ret;
1127
1128         /* for metadata, allow allocates with more holes */
1129         if (btrfs_test_opt(root, SSD_SPREAD)) {
1130                 min_bytes = bytes + empty_size;
1131         } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
1132                 /*
1133                  * we want to do larger allocations when we are
1134                  * flushing out the delayed refs, it helps prevent
1135                  * making more work as we go along.
1136                  */
1137                 if (trans->transaction->delayed_refs.flushing)
1138                         min_bytes = max(bytes, (bytes + empty_size) >> 1);
1139                 else
1140                         min_bytes = max(bytes, (bytes + empty_size) >> 4);
1141         } else
1142                 min_bytes = max(bytes, (bytes + empty_size) >> 2);
1143
1144         spin_lock(&block_group->tree_lock);
1145         spin_lock(&cluster->lock);
1146
1147         /* someone already found a cluster, hooray */
1148         if (cluster->block_group) {
1149                 ret = 0;
1150                 goto out;
1151         }
1152 again:
1153         entry = tree_search_offset(block_group, offset, found_bitmap, 1);
1154         if (!entry) {
1155                 ret = -ENOSPC;
1156                 goto out;
1157         }
1158
1159         /*
1160          * If found_bitmap is true, we exhausted our search for extent entries,
1161          * and we just want to search all of the bitmaps that we can find, and
1162          * ignore any extent entries we find.
1163          */
1164         while (entry->bitmap || found_bitmap ||
1165                (!entry->bitmap && entry->bytes < min_bytes)) {
1166                 struct rb_node *node = rb_next(&entry->offset_index);
1167
1168                 if (entry->bitmap && entry->bytes > bytes + empty_size) {
1169                         ret = btrfs_bitmap_cluster(block_group, entry, cluster,
1170                                                    offset, bytes + empty_size,
1171                                                    min_bytes);
1172                         if (!ret)
1173                                 goto got_it;
1174                 }
1175
1176                 if (!node) {
1177                         ret = -ENOSPC;
1178                         goto out;
1179                 }
1180                 entry = rb_entry(node, struct btrfs_free_space, offset_index);
1181         }
1182
1183         /*
1184          * We already searched all the extent entries from the passed in offset
1185          * to the end and didn't find enough space for the cluster, and we also
1186          * didn't find any bitmaps that met our criteria, just go ahead and exit
1187          */
1188         if (found_bitmap) {
1189                 ret = -ENOSPC;
1190                 goto out;
1191         }
1192
1193         cluster->points_to_bitmap = false;
1194         window_start = entry->offset;
1195         window_free = entry->bytes;
1196         last = entry;
1197         max_extent = entry->bytes;
1198
1199         while (1) {
1200                 /* out window is just right, lets fill it */
1201                 if (window_free >= bytes + empty_size)
1202                         break;
1203
1204                 node = rb_next(&last->offset_index);
1205                 if (!node) {
1206                         if (found_bitmap)
1207                                 goto again;
1208                         ret = -ENOSPC;
1209                         goto out;
1210                 }
1211                 next = rb_entry(node, struct btrfs_free_space, offset_index);
1212
1213                 /*
1214                  * we found a bitmap, so if this search doesn't result in a
1215                  * cluster, we know to go and search again for the bitmaps and
1216                  * start looking for space there
1217                  */
1218                 if (next->bitmap) {
1219                         if (!found_bitmap)
1220                                 offset = next->offset;
1221                         found_bitmap = true;
1222                         last = next;
1223                         continue;
1224                 }
1225
1226                 /*
1227                  * we haven't filled the empty size and the window is
1228                  * very large.  reset and try again
1229                  */
1230                 if (next->offset - (last->offset + last->bytes) > 128 * 1024 ||
1231                     next->offset - window_start > (bytes + empty_size) * 2) {
1232                         entry = next;
1233                         window_start = entry->offset;
1234                         window_free = entry->bytes;
1235                         last = entry;
1236                         max_extent = 0;
1237                 } else {
1238                         last = next;
1239                         window_free += next->bytes;
1240                         if (entry->bytes > max_extent)
1241                                 max_extent = entry->bytes;
1242                 }
1243         }
1244
1245         cluster->window_start = entry->offset;
1246
1247         /*
1248          * now we've found our entries, pull them out of the free space
1249          * cache and put them into the cluster rbtree
1250          *
1251          * The cluster includes an rbtree, but only uses the offset index
1252          * of each free space cache entry.
1253          */
1254         while (1) {
1255                 node = rb_next(&entry->offset_index);
1256                 if (entry->bitmap && node) {
1257                         entry = rb_entry(node, struct btrfs_free_space,
1258                                          offset_index);
1259                         continue;
1260                 } else if (entry->bitmap && !node) {
1261                         break;
1262                 }
1263
1264                 rb_erase(&entry->offset_index, &block_group->free_space_offset);
1265                 ret = tree_insert_offset(&cluster->root, entry->offset,
1266                                          &entry->offset_index, 0);
1267                 BUG_ON(ret);
1268
1269                 if (!node || entry == last)
1270                         break;
1271
1272                 entry = rb_entry(node, struct btrfs_free_space, offset_index);
1273         }
1274
1275         cluster->max_size = max_extent;
1276 got_it:
1277         ret = 0;
1278         atomic_inc(&block_group->count);
1279         list_add_tail(&cluster->block_group_list, &block_group->cluster_list);
1280         cluster->block_group = block_group;
1281 out:
1282         spin_unlock(&cluster->lock);
1283         spin_unlock(&block_group->tree_lock);
1284
1285         return ret;
1286 }
1287
1288 /*
1289  * simple code to zero out a cluster
1290  */
1291 void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
1292 {
1293         spin_lock_init(&cluster->lock);
1294         spin_lock_init(&cluster->refill_lock);
1295         cluster->root.rb_node = NULL;
1296         cluster->max_size = 0;
1297         cluster->points_to_bitmap = false;
1298         INIT_LIST_HEAD(&cluster->block_group_list);
1299         cluster->block_group = NULL;
1300 }
1301