]> git.openfabrics.org - ~shefty/rdma-dev.git/blob - fs/btrfs/scrub.c
Btrfs: add code to scrub to copy read data to another disk
[~shefty/rdma-dev.git] / fs / btrfs / scrub.c
1 /*
2  * Copyright (C) 2011, 2012 STRATO.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 021110-1307, USA.
17  */
18
19 #include <linux/blkdev.h>
20 #include <linux/ratelimit.h>
21 #include "ctree.h"
22 #include "volumes.h"
23 #include "disk-io.h"
24 #include "ordered-data.h"
25 #include "transaction.h"
26 #include "backref.h"
27 #include "extent_io.h"
28 #include "dev-replace.h"
29 #include "check-integrity.h"
30 #include "rcu-string.h"
31
32 /*
33  * This is only the first step towards a full-featured scrub. It reads all
34  * extents and super blocks and verifies the checksums. In case a bad checksum
35  * is found or the extent cannot be read, good data will be written back if
36  * any can be found.
37  *
38  * Future enhancements:
39  *  - In case an unrepairable extent is encountered, track which files are
40  *    affected and report them
41  *  - track and record media errors, throw out bad devices
42  *  - add a mode to also read unallocated space
43  */
44
/*
 * Forward declarations: struct scrub_page points at a struct scrub_block and
 * struct scrub_bio points at a struct scrub_ctx before either is defined.
 */
45 struct scrub_block;
46 struct scrub_ctx;
47
48 /*
49  * the following three values only influence the performance.
50  * The last one configures the number of parallel and outstanding I/O
51  * operations. The first two values configure an upper limit for the number
52  * of (dynamically allocated) pages that are added to a bio.
53  */
/*
 * struct scrub_bio sizes its pagev[] array by the larger of the two
 * per-bio page limits; struct scrub_ctx holds SCRUB_BIOS_PER_SCTX
 * scrub_bio slots.
 */
54 #define SCRUB_PAGES_PER_RD_BIO  32      /* 128k per bio */
55 #define SCRUB_PAGES_PER_WR_BIO  32      /* 128k per bio */
56 #define SCRUB_BIOS_PER_SCTX     64      /* 8MB per device in flight */
57
58 /*
59  * the following value times PAGE_SIZE needs to be large enough to match the
60  * largest node/leaf/sector size that shall be supported.
61  * Values larger than BTRFS_STRIPE_LEN are not supported.
62  */
/* also the capacity of the pagev[] array in struct scrub_block */
63 #define SCRUB_MAX_PAGES_PER_BLOCK       16      /* 64k per node/leaf/sector */
64
/*
 * Bookkeeping for a single page that is part of a scrub_block.
 *
 * scrub_print_warning() reports pagev[0]->logical and derives the reported
 * sector from pagev[0]->physical >> 9. scrub_handle_errored_block() treats
 * mirror_num as 1-based (it BUG()s on values < 1 and uses mirror_num - 1 as
 * an index) and takes the flags, generation, have_csum and csum of the
 * first page as representative for the whole block.
 */
65 struct scrub_page {
        /* block this page belongs to; scrub_free_ctx() drops a ref via it */
66         struct scrub_block      *sblock;
67         struct page             *page;
68         struct btrfs_device     *dev;
69         u64                     flags;  /* extent flags */
70         u64                     generation;
71         u64                     logical;
72         u64                     physical;
73         u64                     physical_for_dev_replace;
        /* presumably managed by scrub_page_get()/scrub_page_put() - confirm */
74         atomic_t                ref_count;
75         struct {
76                 unsigned int    mirror_num:8;
77                 unsigned int    have_csum:1;
78                 unsigned int    io_error:1;
79         };
        /* only meaningful when have_csum is set - TODO confirm with callers */
80         u8                      csum[BTRFS_CSUM_SIZE];
81 };
82
/*
 * One bio slot of a scrub context.
 *
 * scrub_setup_ctx() allocates SCRUB_BIOS_PER_SCTX of these, records each
 * slot number in index, points sctx back at the owning context, sets
 * work.func to scrub_bio_end_io_worker and chains the slots through
 * next_free (slot i -> i + 1, the last slot holds -1).
 */
83 struct scrub_bio {
84         int                     index;
85         struct scrub_ctx        *sctx;
86         struct btrfs_device     *dev;
87         struct bio              *bio;
88         int                     err;
89         u64                     logical;
90         u64                     physical;
        /* one array serves both limits, so size it by the larger one */
91 #if SCRUB_PAGES_PER_WR_BIO >= SCRUB_PAGES_PER_RD_BIO
92         struct scrub_page       *pagev[SCRUB_PAGES_PER_WR_BIO];
93 #else
94         struct scrub_page       *pagev[SCRUB_PAGES_PER_RD_BIO];
95 #endif
        /* number of pagev[] entries in use (scrub_free_ctx() walks them) */
96         int                     page_count;
97         int                     next_free;
98         struct btrfs_work       work;
99 };
100
/*
 * A group of up to SCRUB_MAX_PAGES_PER_BLOCK pages that is handled as one
 * unit. scrub_handle_errored_block() computes the block length as
 * page_count * PAGE_SIZE and expects page_count >= 1.
 */
101 struct scrub_block {
102         struct scrub_page       *pagev[SCRUB_MAX_PAGES_PER_BLOCK];
103         int                     page_count;
104         atomic_t                outstanding_pages;
105         atomic_t                ref_count; /* free mem on transition to zero */
106         struct scrub_ctx        *sctx;
        /* result flags of the checks run on this block */
107         struct {
108                 unsigned int    header_error:1;
109                 unsigned int    checksum_error:1;
110                 unsigned int    no_io_error_seen:1;
111                 unsigned int    generation_error:1; /* also sets header_error */
112         };
113 };
114
/*
 * Write-side state, embedded in struct scrub_ctx as wr_ctx. It is set up by
 * scrub_setup_wr_ctx() (called from scrub_setup_ctx()) and released by
 * scrub_free_wr_ctx() (called from scrub_free_ctx()).
 */
115 struct scrub_wr_ctx {
116         struct scrub_bio *wr_curr_bio;
        /*
         * NOTE(review): scrub_setup_ctx() passes fs_info->dev_replace.tgtdev
         * to scrub_setup_wr_ctx(); presumably that is what ends up here.
         */
117         struct btrfs_device *tgtdev;
118         int pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */
119         atomic_t flush_all_writes;
120         struct mutex wr_lock;
121 };
122
/*
 * Context of one scrub run; allocated and initialised by scrub_setup_ctx()
 * and torn down by scrub_free_ctx().
 */
123 struct scrub_ctx {
124         struct scrub_bio        *bios[SCRUB_BIOS_PER_SCTX];
125         struct btrfs_root       *dev_root;
        /* head of the next_free chain through bios[]; starts at slot 0 */
126         int                     first_free;
        /* slot in bios[] holding a not yet released bio, or -1 for none */
127         int                     curr;
128         atomic_t                bios_in_flight;
        /* counted by scrub_pending_trans_workers_inc()/_dec() */
129         atomic_t                workers_pending;
130         spinlock_t              list_lock;
        /* woken whenever bios_in_flight or workers_pending is decremented */
131         wait_queue_head_t       list_wait;
132         u16                     csum_size;
        /* list of struct btrfs_ordered_sum, emptied by scrub_free_csums() */
133         struct list_head        csum_list;
134         atomic_t                cancel_req;
135         int                     readonly;
        /* min(SCRUB_PAGES_PER_RD_BIO, bio_get_nr_vecs()) when a bdev exists */
136         int                     pages_per_rd_bio;
137         u32                     sectorsize;
138         u32                     nodesize;
139         u32                     leafsize;
140
141         int                     is_dev_replace;
142         struct scrub_wr_ctx     wr_ctx;
143
144         /*
145          * statistics
146          */
147         struct btrfs_scrub_progress stat;
        /* taken around every update of stat in the code visible here */
148         spinlock_t              stat_lock;
149 };
150
/*
 * Work item consumed by the scrub_fixup_nodatasum() worker, which finds it
 * via container_of() on work and kfree()s it when done. The worker joins a
 * transaction on root, passes logical to iterate_inodes_from_logical() and
 * hands the item itself to scrub_fixup_readpage(), which uses mirror_num
 * to pick the copy to read or repair. dev is only used to name the device
 * in the error message.
 */
151 struct scrub_fixup_nodatasum {
152         struct scrub_ctx        *sctx;
153         struct btrfs_device     *dev;
154         u64                     logical;
155         struct btrfs_root       *root;
156         struct btrfs_work       work;
157         int                     mirror_num;
158 };
159
/*
 * NOTE(review): presumably the work item for copy_nocow_pages_worker();
 * the fields mirror the arguments of copy_nocow_pages(). The code using
 * this struct is not visible from here - confirm.
 */
160 struct scrub_copy_nocow_ctx {
161         struct scrub_ctx        *sctx;
162         u64                     logical;
163         u64                     len;
164         int                     mirror_num;
165         u64                     physical_for_dev_replace;
166         struct btrfs_work       work;
167 };
168
/*
 * Context that scrub_print_warning() fills in and passes, through
 * iterate_extent_inodes(), to scrub_print_warning_inode().
 */
169 struct scrub_warning {
        /* path reused for the inode item lookup and for init_ipath() */
170         struct btrfs_path       *path;
171         u64                     extent_item_size;
172         char                    *scratch_buf;
173         char                    *msg_buf;
174         const char              *errstr;
        /* physical byte offset of the first page >> 9 */
175         sector_t                sector;
176         u64                     logical;
        /* set only on the data-extent path of scrub_print_warning() */
177         struct btrfs_device     *dev;
178         int                     msg_bufsize;
179         int                     scratch_bufsize;
180 };
181
182
/*
 * Prototypes of the file-local helpers, declared up front so that the
 * definitions below can reference each other regardless of their order.
 */
183 static void scrub_pending_bio_inc(struct scrub_ctx *sctx);
184 static void scrub_pending_bio_dec(struct scrub_ctx *sctx);
185 static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx);
186 static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx);
187 static int scrub_handle_errored_block(struct scrub_block *sblock_to_check);
188 static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
189                                      struct btrfs_fs_info *fs_info,
190                                      struct scrub_block *original_sblock,
191                                      u64 length, u64 logical,
192                                      struct scrub_block *sblocks_for_recheck);
193 static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
194                                 struct scrub_block *sblock, int is_metadata,
195                                 int have_csum, u8 *csum, u64 generation,
196                                 u16 csum_size);
197 static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
198                                          struct scrub_block *sblock,
199                                          int is_metadata, int have_csum,
200                                          const u8 *csum, u64 generation,
201                                          u16 csum_size);
202 static void scrub_complete_bio_end_io(struct bio *bio, int err);
203 static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
204                                              struct scrub_block *sblock_good,
205                                              int force_write);
206 static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
207                                             struct scrub_block *sblock_good,
208                                             int page_num, int force_write);
209 static void scrub_write_block_to_dev_replace(struct scrub_block *sblock);
210 static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
211                                            int page_num);
212 static int scrub_checksum_data(struct scrub_block *sblock);
213 static int scrub_checksum_tree_block(struct scrub_block *sblock);
214 static int scrub_checksum_super(struct scrub_block *sblock);
215 static void scrub_block_get(struct scrub_block *sblock);
216 static void scrub_block_put(struct scrub_block *sblock);
217 static void scrub_page_get(struct scrub_page *spage);
218 static void scrub_page_put(struct scrub_page *spage);
219 static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
220                                     struct scrub_page *spage);
221 static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
222                        u64 physical, struct btrfs_device *dev, u64 flags,
223                        u64 gen, int mirror_num, u8 *csum, int force,
224                        u64 physical_for_dev_replace);
225 static void scrub_bio_end_io(struct bio *bio, int err);
226 static void scrub_bio_end_io_worker(struct btrfs_work *work);
227 static void scrub_block_complete(struct scrub_block *sblock);
228 static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
229                                u64 extent_logical, u64 extent_len,
230                                u64 *extent_physical,
231                                struct btrfs_device **extent_dev,
232                                int *extent_mirror_num);
233 static int scrub_setup_wr_ctx(struct scrub_ctx *sctx,
234                               struct scrub_wr_ctx *wr_ctx,
235                               struct btrfs_fs_info *fs_info,
236                               struct btrfs_device *dev,
237                               int is_dev_replace);
238 static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx);
239 static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
240                                     struct scrub_page *spage);
241 static void scrub_wr_submit(struct scrub_ctx *sctx);
242 static void scrub_wr_bio_end_io(struct bio *bio, int err);
243 static void scrub_wr_bio_end_io_worker(struct btrfs_work *work);
244 static int write_page_nocow(struct scrub_ctx *sctx,
245                             u64 physical_for_dev_replace, struct page *page);
246 static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
247                                       void *ctx);
248 static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
249                             int mirror_num, u64 physical_for_dev_replace);
250 static void copy_nocow_pages_worker(struct btrfs_work *work);
251
252
253 static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
254 {
255         atomic_inc(&sctx->bios_in_flight);
256 }
257
258 static void scrub_pending_bio_dec(struct scrub_ctx *sctx)
259 {
260         atomic_dec(&sctx->bios_in_flight);
261         wake_up(&sctx->list_wait);
262 }
263
264 /*
265  * used for workers that require transaction commits (i.e., for the
266  * NOCOW case)
267  */
268 static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx)
269 {
270         struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
271
272         /*
273          * increment scrubs_running to prevent cancel requests from
274          * completing as long as a worker is running. we must also
275          * increment scrubs_paused to prevent deadlocking on pause
276          * requests used for transactions commits (as the worker uses a
277          * transaction context). it is safe to regard the worker
278          * as paused for all matters practical. effectively, we only
279          * avoid cancellation requests from completing.
280          */
281         mutex_lock(&fs_info->scrub_lock);
282         atomic_inc(&fs_info->scrubs_running);
283         atomic_inc(&fs_info->scrubs_paused);
284         mutex_unlock(&fs_info->scrub_lock);
285         atomic_inc(&sctx->workers_pending);
286 }
287
288 /* used for workers that require transaction commits */
289 static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx)
290 {
291         struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
292
293         /*
294          * see scrub_pending_trans_workers_inc() why we're pretending
295          * to be paused in the scrub counters
296          */
297         mutex_lock(&fs_info->scrub_lock);
298         atomic_dec(&fs_info->scrubs_running);
299         atomic_dec(&fs_info->scrubs_paused);
300         mutex_unlock(&fs_info->scrub_lock);
301         atomic_dec(&sctx->workers_pending);
302         wake_up(&fs_info->scrub_pause_wait);
303         wake_up(&sctx->list_wait);
304 }
305
306 static void scrub_free_csums(struct scrub_ctx *sctx)
307 {
308         while (!list_empty(&sctx->csum_list)) {
309                 struct btrfs_ordered_sum *sum;
310                 sum = list_first_entry(&sctx->csum_list,
311                                        struct btrfs_ordered_sum, list);
312                 list_del(&sum->list);
313                 kfree(sum);
314         }
315 }
316
317 static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
318 {
319         int i;
320
321         if (!sctx)
322                 return;
323
324         scrub_free_wr_ctx(&sctx->wr_ctx);
325
326         /* this can happen when scrub is cancelled */
327         if (sctx->curr != -1) {
328                 struct scrub_bio *sbio = sctx->bios[sctx->curr];
329
330                 for (i = 0; i < sbio->page_count; i++) {
331                         WARN_ON(!sbio->pagev[i]->page);
332                         scrub_block_put(sbio->pagev[i]->sblock);
333                 }
334                 bio_put(sbio->bio);
335         }
336
337         for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
338                 struct scrub_bio *sbio = sctx->bios[i];
339
340                 if (!sbio)
341                         break;
342                 kfree(sbio);
343         }
344
345         scrub_free_csums(sctx);
346         kfree(sctx);
347 }
348
349 static noinline_for_stack
350 struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
351 {
352         struct scrub_ctx *sctx;
353         int             i;
354         struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
355         int pages_per_rd_bio;
356         int ret;
357
358         /*
359          * the setting of pages_per_rd_bio is correct for scrub but might
360          * be wrong for the dev_replace code where we might read from
361          * different devices in the initial huge bios. However, that
362          * code is able to correctly handle the case when adding a page
363          * to a bio fails.
364          */
365         if (dev->bdev)
366                 pages_per_rd_bio = min_t(int, SCRUB_PAGES_PER_RD_BIO,
367                                          bio_get_nr_vecs(dev->bdev));
368         else
369                 pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO;
370         sctx = kzalloc(sizeof(*sctx), GFP_NOFS);
371         if (!sctx)
372                 goto nomem;
373         sctx->is_dev_replace = is_dev_replace;
374         sctx->pages_per_rd_bio = pages_per_rd_bio;
375         sctx->curr = -1;
376         sctx->dev_root = dev->dev_root;
377         for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
378                 struct scrub_bio *sbio;
379
380                 sbio = kzalloc(sizeof(*sbio), GFP_NOFS);
381                 if (!sbio)
382                         goto nomem;
383                 sctx->bios[i] = sbio;
384
385                 sbio->index = i;
386                 sbio->sctx = sctx;
387                 sbio->page_count = 0;
388                 sbio->work.func = scrub_bio_end_io_worker;
389
390                 if (i != SCRUB_BIOS_PER_SCTX - 1)
391                         sctx->bios[i]->next_free = i + 1;
392                 else
393                         sctx->bios[i]->next_free = -1;
394         }
395         sctx->first_free = 0;
396         sctx->nodesize = dev->dev_root->nodesize;
397         sctx->leafsize = dev->dev_root->leafsize;
398         sctx->sectorsize = dev->dev_root->sectorsize;
399         atomic_set(&sctx->bios_in_flight, 0);
400         atomic_set(&sctx->workers_pending, 0);
401         atomic_set(&sctx->cancel_req, 0);
402         sctx->csum_size = btrfs_super_csum_size(fs_info->super_copy);
403         INIT_LIST_HEAD(&sctx->csum_list);
404
405         spin_lock_init(&sctx->list_lock);
406         spin_lock_init(&sctx->stat_lock);
407         init_waitqueue_head(&sctx->list_wait);
408
409         ret = scrub_setup_wr_ctx(sctx, &sctx->wr_ctx, fs_info,
410                                  fs_info->dev_replace.tgtdev, is_dev_replace);
411         if (ret) {
412                 scrub_free_ctx(sctx);
413                 return ERR_PTR(ret);
414         }
415         return sctx;
416
417 nomem:
418         scrub_free_ctx(sctx);
419         return ERR_PTR(-ENOMEM);
420 }
421
422 static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
423                                      void *warn_ctx)
424 {
425         u64 isize;
426         u32 nlink;
427         int ret;
428         int i;
429         struct extent_buffer *eb;
430         struct btrfs_inode_item *inode_item;
431         struct scrub_warning *swarn = warn_ctx;
432         struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info;
433         struct inode_fs_paths *ipath = NULL;
434         struct btrfs_root *local_root;
435         struct btrfs_key root_key;
436
437         root_key.objectid = root;
438         root_key.type = BTRFS_ROOT_ITEM_KEY;
439         root_key.offset = (u64)-1;
440         local_root = btrfs_read_fs_root_no_name(fs_info, &root_key);
441         if (IS_ERR(local_root)) {
442                 ret = PTR_ERR(local_root);
443                 goto err;
444         }
445
446         ret = inode_item_info(inum, 0, local_root, swarn->path);
447         if (ret) {
448                 btrfs_release_path(swarn->path);
449                 goto err;
450         }
451
452         eb = swarn->path->nodes[0];
453         inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
454                                         struct btrfs_inode_item);
455         isize = btrfs_inode_size(eb, inode_item);
456         nlink = btrfs_inode_nlink(eb, inode_item);
457         btrfs_release_path(swarn->path);
458
459         ipath = init_ipath(4096, local_root, swarn->path);
460         if (IS_ERR(ipath)) {
461                 ret = PTR_ERR(ipath);
462                 ipath = NULL;
463                 goto err;
464         }
465         ret = paths_from_inode(inum, ipath);
466
467         if (ret < 0)
468                 goto err;
469
470         /*
471          * we deliberately ignore the bit ipath might have been too small to
472          * hold all of the paths here
473          */
474         for (i = 0; i < ipath->fspath->elem_cnt; ++i)
475                 printk_in_rcu(KERN_WARNING "btrfs: %s at logical %llu on dev "
476                         "%s, sector %llu, root %llu, inode %llu, offset %llu, "
477                         "length %llu, links %u (path: %s)\n", swarn->errstr,
478                         swarn->logical, rcu_str_deref(swarn->dev->name),
479                         (unsigned long long)swarn->sector, root, inum, offset,
480                         min(isize - offset, (u64)PAGE_SIZE), nlink,
481                         (char *)(unsigned long)ipath->fspath->val[i]);
482
483         free_ipath(ipath);
484         return 0;
485
486 err:
487         printk_in_rcu(KERN_WARNING "btrfs: %s at logical %llu on dev "
488                 "%s, sector %llu, root %llu, inode %llu, offset %llu: path "
489                 "resolving failed with ret=%d\n", swarn->errstr,
490                 swarn->logical, rcu_str_deref(swarn->dev->name),
491                 (unsigned long long)swarn->sector, root, inum, offset, ret);
492
493         free_ipath(ipath);
494         return 0;
495 }
496
497 static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
498 {
499         struct btrfs_device *dev;
500         struct btrfs_fs_info *fs_info;
501         struct btrfs_path *path;
502         struct btrfs_key found_key;
503         struct extent_buffer *eb;
504         struct btrfs_extent_item *ei;
505         struct scrub_warning swarn;
506         unsigned long ptr = 0;
507         u64 extent_item_pos;
508         u64 flags = 0;
509         u64 ref_root;
510         u32 item_size;
511         u8 ref_level;
512         const int bufsize = 4096;
513         int ret;
514
515         WARN_ON(sblock->page_count < 1);
516         dev = sblock->pagev[0]->dev;
517         fs_info = sblock->sctx->dev_root->fs_info;
518
519         path = btrfs_alloc_path();
520
521         swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS);
522         swarn.msg_buf = kmalloc(bufsize, GFP_NOFS);
523         swarn.sector = (sblock->pagev[0]->physical) >> 9;
524         swarn.logical = sblock->pagev[0]->logical;
525         swarn.errstr = errstr;
526         swarn.dev = NULL;
527         swarn.msg_bufsize = bufsize;
528         swarn.scratch_bufsize = bufsize;
529
530         if (!path || !swarn.scratch_buf || !swarn.msg_buf)
531                 goto out;
532
533         ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
534                                   &flags);
535         if (ret < 0)
536                 goto out;
537
538         extent_item_pos = swarn.logical - found_key.objectid;
539         swarn.extent_item_size = found_key.offset;
540
541         eb = path->nodes[0];
542         ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
543         item_size = btrfs_item_size_nr(eb, path->slots[0]);
544         btrfs_release_path(path);
545
546         if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
547                 do {
548                         ret = tree_backref_for_extent(&ptr, eb, ei, item_size,
549                                                         &ref_root, &ref_level);
550                         printk_in_rcu(KERN_WARNING
551                                 "btrfs: %s at logical %llu on dev %s, "
552                                 "sector %llu: metadata %s (level %d) in tree "
553                                 "%llu\n", errstr, swarn.logical,
554                                 rcu_str_deref(dev->name),
555                                 (unsigned long long)swarn.sector,
556                                 ref_level ? "node" : "leaf",
557                                 ret < 0 ? -1 : ref_level,
558                                 ret < 0 ? -1 : ref_root);
559                 } while (ret != 1);
560         } else {
561                 swarn.path = path;
562                 swarn.dev = dev;
563                 iterate_extent_inodes(fs_info, found_key.objectid,
564                                         extent_item_pos, 1,
565                                         scrub_print_warning_inode, &swarn);
566         }
567
568 out:
569         btrfs_free_path(path);
570         kfree(swarn.scratch_buf);
571         kfree(swarn.msg_buf);
572 }
573
574 static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)
575 {
576         struct page *page = NULL;
577         unsigned long index;
578         struct scrub_fixup_nodatasum *fixup = fixup_ctx;
579         int ret;
580         int corrected = 0;
581         struct btrfs_key key;
582         struct inode *inode = NULL;
583         u64 end = offset + PAGE_SIZE - 1;
584         struct btrfs_root *local_root;
585
586         key.objectid = root;
587         key.type = BTRFS_ROOT_ITEM_KEY;
588         key.offset = (u64)-1;
589         local_root = btrfs_read_fs_root_no_name(fixup->root->fs_info, &key);
590         if (IS_ERR(local_root))
591                 return PTR_ERR(local_root);
592
593         key.type = BTRFS_INODE_ITEM_KEY;
594         key.objectid = inum;
595         key.offset = 0;
596         inode = btrfs_iget(fixup->root->fs_info->sb, &key, local_root, NULL);
597         if (IS_ERR(inode))
598                 return PTR_ERR(inode);
599
600         index = offset >> PAGE_CACHE_SHIFT;
601
602         page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
603         if (!page) {
604                 ret = -ENOMEM;
605                 goto out;
606         }
607
608         if (PageUptodate(page)) {
609                 struct btrfs_fs_info *fs_info;
610                 if (PageDirty(page)) {
611                         /*
612                          * we need to write the data to the defect sector. the
613                          * data that was in that sector is not in memory,
614                          * because the page was modified. we must not write the
615                          * modified page to that sector.
616                          *
617                          * TODO: what could be done here: wait for the delalloc
618                          *       runner to write out that page (might involve
619                          *       COW) and see whether the sector is still
620                          *       referenced afterwards.
621                          *
622                          * For the meantime, we'll treat this error
623                          * incorrectable, although there is a chance that a
624                          * later scrub will find the bad sector again and that
625                          * there's no dirty page in memory, then.
626                          */
627                         ret = -EIO;
628                         goto out;
629                 }
630                 fs_info = BTRFS_I(inode)->root->fs_info;
631                 ret = repair_io_failure(fs_info, offset, PAGE_SIZE,
632                                         fixup->logical, page,
633                                         fixup->mirror_num);
634                 unlock_page(page);
635                 corrected = !ret;
636         } else {
637                 /*
638                  * we need to get good data first. the general readpage path
639                  * will call repair_io_failure for us, we just have to make
640                  * sure we read the bad mirror.
641                  */
642                 ret = set_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
643                                         EXTENT_DAMAGED, GFP_NOFS);
644                 if (ret) {
645                         /* set_extent_bits should give proper error */
646                         WARN_ON(ret > 0);
647                         if (ret > 0)
648                                 ret = -EFAULT;
649                         goto out;
650                 }
651
652                 ret = extent_read_full_page(&BTRFS_I(inode)->io_tree, page,
653                                                 btrfs_get_extent,
654                                                 fixup->mirror_num);
655                 wait_on_page_locked(page);
656
657                 corrected = !test_range_bit(&BTRFS_I(inode)->io_tree, offset,
658                                                 end, EXTENT_DAMAGED, 0, NULL);
659                 if (!corrected)
660                         clear_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
661                                                 EXTENT_DAMAGED, GFP_NOFS);
662         }
663
664 out:
665         if (page)
666                 put_page(page);
667         if (inode)
668                 iput(inode);
669
670         if (ret < 0)
671                 return ret;
672
673         if (ret == 0 && corrected) {
674                 /*
675                  * we only need to call readpage for one of the inodes belonging
676                  * to this extent. so make iterate_extent_inodes stop
677                  */
678                 return 1;
679         }
680
681         return -EIO;
682 }
683
684 static void scrub_fixup_nodatasum(struct btrfs_work *work)
685 {
686         int ret;
687         struct scrub_fixup_nodatasum *fixup;
688         struct scrub_ctx *sctx;
689         struct btrfs_trans_handle *trans = NULL;
690         struct btrfs_fs_info *fs_info;
691         struct btrfs_path *path;
692         int uncorrectable = 0;
693
694         fixup = container_of(work, struct scrub_fixup_nodatasum, work);
695         sctx = fixup->sctx;
696         fs_info = fixup->root->fs_info;
697
698         path = btrfs_alloc_path();
699         if (!path) {
700                 spin_lock(&sctx->stat_lock);
701                 ++sctx->stat.malloc_errors;
702                 spin_unlock(&sctx->stat_lock);
703                 uncorrectable = 1;
704                 goto out;
705         }
706
707         trans = btrfs_join_transaction(fixup->root);
708         if (IS_ERR(trans)) {
709                 uncorrectable = 1;
710                 goto out;
711         }
712
713         /*
714          * the idea is to trigger a regular read through the standard path. we
715          * read a page from the (failed) logical address by specifying the
716          * corresponding copynum of the failed sector. thus, that readpage is
717          * expected to fail.
718          * that is the point where on-the-fly error correction will kick in
719          * (once it's finished) and rewrite the failed sector if a good copy
720          * can be found.
721          */
722         ret = iterate_inodes_from_logical(fixup->logical, fixup->root->fs_info,
723                                                 path, scrub_fixup_readpage,
724                                                 fixup);
725         if (ret < 0) {
726                 uncorrectable = 1;
727                 goto out;
728         }
729         WARN_ON(ret != 1);
730
731         spin_lock(&sctx->stat_lock);
732         ++sctx->stat.corrected_errors;
733         spin_unlock(&sctx->stat_lock);
734
735 out:
736         if (trans && !IS_ERR(trans))
737                 btrfs_end_transaction(trans, fixup->root);
738         if (uncorrectable) {
739                 spin_lock(&sctx->stat_lock);
740                 ++sctx->stat.uncorrectable_errors;
741                 spin_unlock(&sctx->stat_lock);
742                 btrfs_dev_replace_stats_inc(
743                         &sctx->dev_root->fs_info->dev_replace.
744                         num_uncorrectable_read_errors);
745                 printk_ratelimited_in_rcu(KERN_ERR
746                         "btrfs: unable to fixup (nodatasum) error at logical %llu on dev %s\n",
747                         (unsigned long long)fixup->logical,
748                         rcu_str_deref(fixup->dev->name));
749         }
750
751         btrfs_free_path(path);
752         kfree(fixup);
753
754         scrub_pending_trans_workers_dec(sctx);
755 }
756
757 /*
758  * scrub_handle_errored_block gets called when either verification of the
759  * pages failed or the bio failed to read, e.g. with EIO. In the latter
760  * case, this function handles all pages in the bio, even though only one
761  * may be bad.
762  * The goal of this function is to repair the errored block by using the
763  * contents of one of the mirrors.
764  */
765 static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
766 {
767         struct scrub_ctx *sctx = sblock_to_check->sctx;
768         struct btrfs_device *dev;
769         struct btrfs_fs_info *fs_info;
770         u64 length;
771         u64 logical;
772         u64 generation;
773         unsigned int failed_mirror_index;
774         unsigned int is_metadata;
775         unsigned int have_csum;
776         u8 *csum;
777         struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */
778         struct scrub_block *sblock_bad;
779         int ret;
780         int mirror_index;
781         int page_num;
782         int success;
783         static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
784                                       DEFAULT_RATELIMIT_BURST);
785
786         BUG_ON(sblock_to_check->page_count < 1);
787         fs_info = sctx->dev_root->fs_info;
788         length = sblock_to_check->page_count * PAGE_SIZE;
789         logical = sblock_to_check->pagev[0]->logical;
790         generation = sblock_to_check->pagev[0]->generation;
791         BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1);
792         failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1;
793         is_metadata = !(sblock_to_check->pagev[0]->flags &
794                         BTRFS_EXTENT_FLAG_DATA);
795         have_csum = sblock_to_check->pagev[0]->have_csum;
796         csum = sblock_to_check->pagev[0]->csum;
797         dev = sblock_to_check->pagev[0]->dev;
798
799         if (sctx->is_dev_replace && !is_metadata && !have_csum) {
800                 sblocks_for_recheck = NULL;
801                 goto nodatasum_case;
802         }
803
804         /*
805          * read all mirrors one after the other. This includes to
806          * re-read the extent or metadata block that failed (that was
807          * the cause that this fixup code is called) another time,
808          * page by page this time in order to know which pages
809          * caused I/O errors and which ones are good (for all mirrors).
810          * It is the goal to handle the situation when more than one
811          * mirror contains I/O errors, but the errors do not
812          * overlap, i.e. the data can be repaired by selecting the
813          * pages from those mirrors without I/O error on the
814          * particular pages. One example (with blocks >= 2 * PAGE_SIZE)
815          * would be that mirror #1 has an I/O error on the first page,
816          * the second page is good, and mirror #2 has an I/O error on
817          * the second page, but the first page is good.
818          * Then the first page of the first mirror can be repaired by
819          * taking the first page of the second mirror, and the
820          * second page of the second mirror can be repaired by
821          * copying the contents of the 2nd page of the 1st mirror.
822          * One more note: if the pages of one mirror contain I/O
823          * errors, the checksum cannot be verified. In order to get
824          * the best data for repairing, the first attempt is to find
825          * a mirror without I/O errors and with a validated checksum.
826          * Only if this is not possible, the pages are picked from
827          * mirrors with I/O errors without considering the checksum.
828          * If the latter is the case, at the end, the checksum of the
829          * repaired area is verified in order to correctly maintain
830          * the statistics.
831          */
832
833         sblocks_for_recheck = kzalloc(BTRFS_MAX_MIRRORS *
834                                      sizeof(*sblocks_for_recheck),
835                                      GFP_NOFS);
836         if (!sblocks_for_recheck) {
837                 spin_lock(&sctx->stat_lock);
838                 sctx->stat.malloc_errors++;
839                 sctx->stat.read_errors++;
840                 sctx->stat.uncorrectable_errors++;
841                 spin_unlock(&sctx->stat_lock);
842                 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
843                 goto out;
844         }
845
846         /* setup the context, map the logical blocks and alloc the pages */
847         ret = scrub_setup_recheck_block(sctx, fs_info, sblock_to_check, length,
848                                         logical, sblocks_for_recheck);
849         if (ret) {
850                 spin_lock(&sctx->stat_lock);
851                 sctx->stat.read_errors++;
852                 sctx->stat.uncorrectable_errors++;
853                 spin_unlock(&sctx->stat_lock);
854                 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
855                 goto out;
856         }
857         BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
858         sblock_bad = sblocks_for_recheck + failed_mirror_index;
859
860         /* build and submit the bios for the failed mirror, check checksums */
861         scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum,
862                             csum, generation, sctx->csum_size);
863
864         if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
865             sblock_bad->no_io_error_seen) {
866                 /*
867                  * the error disappeared after reading page by page, or
868                  * the area was part of a huge bio and other parts of the
869                  * bio caused I/O errors, or the block layer merged several
870                  * read requests into one and the error is caused by a
871                  * different bio (usually one of the two latter cases is
872                  * the cause)
873                  */
874                 spin_lock(&sctx->stat_lock);
875                 sctx->stat.unverified_errors++;
876                 spin_unlock(&sctx->stat_lock);
877
878                 if (sctx->is_dev_replace)
879                         scrub_write_block_to_dev_replace(sblock_bad);
880                 goto out;
881         }
882
883         if (!sblock_bad->no_io_error_seen) {
884                 spin_lock(&sctx->stat_lock);
885                 sctx->stat.read_errors++;
886                 spin_unlock(&sctx->stat_lock);
887                 if (__ratelimit(&_rs))
888                         scrub_print_warning("i/o error", sblock_to_check);
889                 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
890         } else if (sblock_bad->checksum_error) {
891                 spin_lock(&sctx->stat_lock);
892                 sctx->stat.csum_errors++;
893                 spin_unlock(&sctx->stat_lock);
894                 if (__ratelimit(&_rs))
895                         scrub_print_warning("checksum error", sblock_to_check);
896                 btrfs_dev_stat_inc_and_print(dev,
897                                              BTRFS_DEV_STAT_CORRUPTION_ERRS);
898         } else if (sblock_bad->header_error) {
899                 spin_lock(&sctx->stat_lock);
900                 sctx->stat.verify_errors++;
901                 spin_unlock(&sctx->stat_lock);
902                 if (__ratelimit(&_rs))
903                         scrub_print_warning("checksum/header error",
904                                             sblock_to_check);
905                 if (sblock_bad->generation_error)
906                         btrfs_dev_stat_inc_and_print(dev,
907                                 BTRFS_DEV_STAT_GENERATION_ERRS);
908                 else
909                         btrfs_dev_stat_inc_and_print(dev,
910                                 BTRFS_DEV_STAT_CORRUPTION_ERRS);
911         }
912
913         if (sctx->readonly && !sctx->is_dev_replace)
914                 goto did_not_correct_error;
915
916         if (!is_metadata && !have_csum) {
917                 struct scrub_fixup_nodatasum *fixup_nodatasum;
918
919 nodatasum_case:
920                 WARN_ON(sctx->is_dev_replace);
921
922                 /*
923                  * !is_metadata and !have_csum, this means that the data
924                  * might not be COW'ed, that it might be modified
925                  * concurrently. The general strategy to work on the
926                  * commit root does not help in the case when COW is not
927                  * used.
928                  */
929                 fixup_nodatasum = kzalloc(sizeof(*fixup_nodatasum), GFP_NOFS);
930                 if (!fixup_nodatasum)
931                         goto did_not_correct_error;
932                 fixup_nodatasum->sctx = sctx;
933                 fixup_nodatasum->dev = dev;
934                 fixup_nodatasum->logical = logical;
935                 fixup_nodatasum->root = fs_info->extent_root;
936                 fixup_nodatasum->mirror_num = failed_mirror_index + 1;
937                 scrub_pending_trans_workers_inc(sctx);
938                 fixup_nodatasum->work.func = scrub_fixup_nodatasum;
939                 btrfs_queue_worker(&fs_info->scrub_workers,
940                                    &fixup_nodatasum->work);
941                 goto out;
942         }
943
944         /*
945          * now build and submit the bios for the other mirrors, check
946          * checksums.
947          * First try to pick the mirror which is completely without I/O
948          * errors and also does not have a checksum error.
949          * If one is found, and if a checksum is present, the full block
950          * that is known to contain an error is rewritten. Afterwards
951          * the block is known to be corrected.
952          * If a mirror is found which is completely correct, and no
953          * checksum is present, only those pages are rewritten that had
954          * an I/O error in the block to be repaired, since it cannot be
955          * determined, which copy of the other pages is better (and it
956          * could happen otherwise that a correct page would be
957          * overwritten by a bad one).
958          */
959         for (mirror_index = 0;
960              mirror_index < BTRFS_MAX_MIRRORS &&
961              sblocks_for_recheck[mirror_index].page_count > 0;
962              mirror_index++) {
963                 struct scrub_block *sblock_other;
964
965                 if (mirror_index == failed_mirror_index)
966                         continue;
967                 sblock_other = sblocks_for_recheck + mirror_index;
968
969                 /* build and submit the bios, check checksums */
970                 scrub_recheck_block(fs_info, sblock_other, is_metadata,
971                                     have_csum, csum, generation,
972                                     sctx->csum_size);
973
974                 if (!sblock_other->header_error &&
975                     !sblock_other->checksum_error &&
976                     sblock_other->no_io_error_seen) {
977                         if (sctx->is_dev_replace) {
978                                 scrub_write_block_to_dev_replace(sblock_other);
979                         } else {
980                                 int force_write = is_metadata || have_csum;
981
982                                 ret = scrub_repair_block_from_good_copy(
983                                                 sblock_bad, sblock_other,
984                                                 force_write);
985                         }
986                         if (0 == ret)
987                                 goto corrected_error;
988                 }
989         }
990
991         /*
992          * for dev_replace, pick good pages and write to the target device.
993          */
994         if (sctx->is_dev_replace) {
995                 success = 1;
996                 for (page_num = 0; page_num < sblock_bad->page_count;
997                      page_num++) {
998                         int sub_success;
999
1000                         sub_success = 0;
1001                         for (mirror_index = 0;
1002                              mirror_index < BTRFS_MAX_MIRRORS &&
1003                              sblocks_for_recheck[mirror_index].page_count > 0;
1004                              mirror_index++) {
1005                                 struct scrub_block *sblock_other =
1006                                         sblocks_for_recheck + mirror_index;
1007                                 struct scrub_page *page_other =
1008                                         sblock_other->pagev[page_num];
1009
1010                                 if (!page_other->io_error) {
1011                                         ret = scrub_write_page_to_dev_replace(
1012                                                         sblock_other, page_num);
1013                                         if (ret == 0) {
1014                                                 /* succeeded for this page */
1015                                                 sub_success = 1;
1016                                                 break;
1017                                         } else {
1018                                                 btrfs_dev_replace_stats_inc(
1019                                                         &sctx->dev_root->
1020                                                         fs_info->dev_replace.
1021                                                         num_write_errors);
1022                                         }
1023                                 }
1024                         }
1025
1026                         if (!sub_success) {
1027                                 /*
1028                                  * did not find a mirror to fetch the page
1029                                  * from. scrub_write_page_to_dev_replace()
1030                                  * handles this case (page->io_error), by
1031                                  * filling the block with zeros before
1032                                  * submitting the write request
1033                                  */
1034                                 success = 0;
1035                                 ret = scrub_write_page_to_dev_replace(
1036                                                 sblock_bad, page_num);
1037                                 if (ret)
1038                                         btrfs_dev_replace_stats_inc(
1039                                                 &sctx->dev_root->fs_info->
1040                                                 dev_replace.num_write_errors);
1041                         }
1042                 }
1043
1044                 goto out;
1045         }
1046
1047         /*
1048          * for regular scrub, repair those pages that are errored.
1049          * In case of I/O errors in the area that is supposed to be
1050          * repaired, continue by picking good copies of those pages.
1051          * Select the good pages from mirrors to rewrite bad pages from
1052          * the area to fix. Afterwards verify the checksum of the block
1053          * that is supposed to be repaired. This verification step is
1054          * only done for the purpose of statistic counting and for the
1055          * final scrub report, whether errors remain.
1056          * A perfect algorithm could make use of the checksum and try
1057          * all possible combinations of pages from the different mirrors
1058          * until the checksum verification succeeds. For example, when
1059          * the 2nd page of mirror #1 faces I/O errors, and the 2nd page
1060          * of mirror #2 is readable but the final checksum test fails,
1061          * then the 2nd page of mirror #3 could be tried, whether now
1062          * the final checksum succeedes. But this would be a rare
1063          * exception and is therefore not implemented. At least it is
1064          * avoided that the good copy is overwritten.
1065          * A more useful improvement would be to pick the sectors
1066          * without I/O error based on sector sizes (512 bytes on legacy
1067          * disks) instead of on PAGE_SIZE. Then maybe 512 byte of one
1068          * mirror could be repaired by taking 512 byte of a different
1069          * mirror, even if other 512 byte sectors in the same PAGE_SIZE
1070          * area are unreadable.
1071          */
1072
1073         /* can only fix I/O errors from here on */
1074         if (sblock_bad->no_io_error_seen)
1075                 goto did_not_correct_error;
1076
1077         success = 1;
1078         for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
1079                 struct scrub_page *page_bad = sblock_bad->pagev[page_num];
1080
1081                 if (!page_bad->io_error)
1082                         continue;
1083
1084                 for (mirror_index = 0;
1085                      mirror_index < BTRFS_MAX_MIRRORS &&
1086                      sblocks_for_recheck[mirror_index].page_count > 0;
1087                      mirror_index++) {
1088                         struct scrub_block *sblock_other = sblocks_for_recheck +
1089                                                            mirror_index;
1090                         struct scrub_page *page_other = sblock_other->pagev[
1091                                                         page_num];
1092
1093                         if (!page_other->io_error) {
1094                                 ret = scrub_repair_page_from_good_copy(
1095                                         sblock_bad, sblock_other, page_num, 0);
1096                                 if (0 == ret) {
1097                                         page_bad->io_error = 0;
1098                                         break; /* succeeded for this page */
1099                                 }
1100                         }
1101                 }
1102
1103                 if (page_bad->io_error) {
1104                         /* did not find a mirror to copy the page from */
1105                         success = 0;
1106                 }
1107         }
1108
1109         if (success) {
1110                 if (is_metadata || have_csum) {
1111                         /*
1112                          * need to verify the checksum now that all
1113                          * sectors on disk are repaired (the write
1114                          * request for data to be repaired is on its way).
1115                          * Just be lazy and use scrub_recheck_block()
1116                          * which re-reads the data before the checksum
1117                          * is verified, but most likely the data comes out
1118                          * of the page cache.
1119                          */
1120                         scrub_recheck_block(fs_info, sblock_bad,
1121                                             is_metadata, have_csum, csum,
1122                                             generation, sctx->csum_size);
1123                         if (!sblock_bad->header_error &&
1124                             !sblock_bad->checksum_error &&
1125                             sblock_bad->no_io_error_seen)
1126                                 goto corrected_error;
1127                         else
1128                                 goto did_not_correct_error;
1129                 } else {
1130 corrected_error:
1131                         spin_lock(&sctx->stat_lock);
1132                         sctx->stat.corrected_errors++;
1133                         spin_unlock(&sctx->stat_lock);
1134                         printk_ratelimited_in_rcu(KERN_ERR
1135                                 "btrfs: fixed up error at logical %llu on dev %s\n",
1136                                 (unsigned long long)logical,
1137                                 rcu_str_deref(dev->name));
1138                 }
1139         } else {
1140 did_not_correct_error:
1141                 spin_lock(&sctx->stat_lock);
1142                 sctx->stat.uncorrectable_errors++;
1143                 spin_unlock(&sctx->stat_lock);
1144                 printk_ratelimited_in_rcu(KERN_ERR
1145                         "btrfs: unable to fixup (regular) error at logical %llu on dev %s\n",
1146                         (unsigned long long)logical,
1147                         rcu_str_deref(dev->name));
1148         }
1149
1150 out:
1151         if (sblocks_for_recheck) {
1152                 for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS;
1153                      mirror_index++) {
1154                         struct scrub_block *sblock = sblocks_for_recheck +
1155                                                      mirror_index;
1156                         int page_index;
1157
1158                         for (page_index = 0; page_index < sblock->page_count;
1159                              page_index++) {
1160                                 sblock->pagev[page_index]->sblock = NULL;
1161                                 scrub_page_put(sblock->pagev[page_index]);
1162                         }
1163                 }
1164                 kfree(sblocks_for_recheck);
1165         }
1166
1167         return 0;
1168 }
1169
1170 static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
1171                                      struct btrfs_fs_info *fs_info,
1172                                      struct scrub_block *original_sblock,
1173                                      u64 length, u64 logical,
1174                                      struct scrub_block *sblocks_for_recheck)
1175 {
1176         int page_index;
1177         int mirror_index;
1178         int ret;
1179
1180         /*
1181          * note: the two members ref_count and outstanding_pages
1182          * are not used (and not set) in the blocks that are used for
1183          * the recheck procedure
1184          */
1185
1186         page_index = 0;
1187         while (length > 0) {
1188                 u64 sublen = min_t(u64, length, PAGE_SIZE);
1189                 u64 mapped_length = sublen;
1190                 struct btrfs_bio *bbio = NULL;
1191
1192                 /*
1193                  * with a length of PAGE_SIZE, each returned stripe
1194                  * represents one mirror
1195                  */
1196                 ret = btrfs_map_block(fs_info, WRITE, logical, &mapped_length,
1197                                       &bbio, 0);
1198                 if (ret || !bbio || mapped_length < sublen) {
1199                         kfree(bbio);
1200                         return -EIO;
1201                 }
1202
1203                 BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO);
1204                 for (mirror_index = 0; mirror_index < (int)bbio->num_stripes;
1205                      mirror_index++) {
1206                         struct scrub_block *sblock;
1207                         struct scrub_page *page;
1208
1209                         if (mirror_index >= BTRFS_MAX_MIRRORS)
1210                                 continue;
1211
1212                         sblock = sblocks_for_recheck + mirror_index;
1213                         sblock->sctx = sctx;
1214                         page = kzalloc(sizeof(*page), GFP_NOFS);
1215                         if (!page) {
1216 leave_nomem:
1217                                 spin_lock(&sctx->stat_lock);
1218                                 sctx->stat.malloc_errors++;
1219                                 spin_unlock(&sctx->stat_lock);
1220                                 kfree(bbio);
1221                                 return -ENOMEM;
1222                         }
1223                         scrub_page_get(page);
1224                         sblock->pagev[page_index] = page;
1225                         page->logical = logical;
1226                         page->physical = bbio->stripes[mirror_index].physical;
1227                         BUG_ON(page_index >= original_sblock->page_count);
1228                         page->physical_for_dev_replace =
1229                                 original_sblock->pagev[page_index]->
1230                                 physical_for_dev_replace;
1231                         /* for missing devices, dev->bdev is NULL */
1232                         page->dev = bbio->stripes[mirror_index].dev;
1233                         page->mirror_num = mirror_index + 1;
1234                         sblock->page_count++;
1235                         page->page = alloc_page(GFP_NOFS);
1236                         if (!page->page)
1237                                 goto leave_nomem;
1238                 }
1239                 kfree(bbio);
1240                 length -= sublen;
1241                 logical += sublen;
1242                 page_index++;
1243         }
1244
1245         return 0;
1246 }
1247
1248 /*
1249  * this function will check the on disk data for checksum errors, header
1250  * errors and read I/O errors. If any I/O errors happen, the exact pages
1251  * which are errored are marked as being bad. The goal is to enable scrub
1252  * to take those pages that are not errored from all the mirrors so that
1253  * the pages that are errored in the just handled mirror can be repaired.
1254  */
1255 static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
1256                                 struct scrub_block *sblock, int is_metadata,
1257                                 int have_csum, u8 *csum, u64 generation,
1258                                 u16 csum_size)
1259 {
1260         int page_num;
1261
1262         sblock->no_io_error_seen = 1;
1263         sblock->header_error = 0;
1264         sblock->checksum_error = 0;
1265
1266         for (page_num = 0; page_num < sblock->page_count; page_num++) {
1267                 struct bio *bio;
1268                 struct scrub_page *page = sblock->pagev[page_num];
1269                 DECLARE_COMPLETION_ONSTACK(complete);
1270
1271                 if (page->dev->bdev == NULL) {
1272                         page->io_error = 1;
1273                         sblock->no_io_error_seen = 0;
1274                         continue;
1275                 }
1276
1277                 WARN_ON(!page->page);
1278                 bio = bio_alloc(GFP_NOFS, 1);
1279                 if (!bio) {
1280                         page->io_error = 1;
1281                         sblock->no_io_error_seen = 0;
1282                         continue;
1283                 }
1284                 bio->bi_bdev = page->dev->bdev;
1285                 bio->bi_sector = page->physical >> 9;
1286                 bio->bi_end_io = scrub_complete_bio_end_io;
1287                 bio->bi_private = &complete;
1288
1289                 bio_add_page(bio, page->page, PAGE_SIZE, 0);
1290                 btrfsic_submit_bio(READ, bio);
1291
1292                 /* this will also unplug the queue */
1293                 wait_for_completion(&complete);
1294
1295                 page->io_error = !test_bit(BIO_UPTODATE, &bio->bi_flags);
1296                 if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
1297                         sblock->no_io_error_seen = 0;
1298                 bio_put(bio);
1299         }
1300
1301         if (sblock->no_io_error_seen)
1302                 scrub_recheck_block_checksum(fs_info, sblock, is_metadata,
1303                                              have_csum, csum, generation,
1304                                              csum_size);
1305
1306         return;
1307 }
1308
1309 static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
1310                                          struct scrub_block *sblock,
1311                                          int is_metadata, int have_csum,
1312                                          const u8 *csum, u64 generation,
1313                                          u16 csum_size)
1314 {
1315         int page_num;
1316         u8 calculated_csum[BTRFS_CSUM_SIZE];
1317         u32 crc = ~(u32)0;
1318         struct btrfs_root *root = fs_info->extent_root;
1319         void *mapped_buffer;
1320
1321         WARN_ON(!sblock->pagev[0]->page);
1322         if (is_metadata) {
1323                 struct btrfs_header *h;
1324
1325                 mapped_buffer = kmap_atomic(sblock->pagev[0]->page);
1326                 h = (struct btrfs_header *)mapped_buffer;
1327
1328                 if (sblock->pagev[0]->logical != le64_to_cpu(h->bytenr) ||
1329                     memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE) ||
1330                     memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
1331                            BTRFS_UUID_SIZE)) {
1332                         sblock->header_error = 1;
1333                 } else if (generation != le64_to_cpu(h->generation)) {
1334                         sblock->header_error = 1;
1335                         sblock->generation_error = 1;
1336                 }
1337                 csum = h->csum;
1338         } else {
1339                 if (!have_csum)
1340                         return;
1341
1342                 mapped_buffer = kmap_atomic(sblock->pagev[0]->page);
1343         }
1344
1345         for (page_num = 0;;) {
1346                 if (page_num == 0 && is_metadata)
1347                         crc = btrfs_csum_data(root,
1348                                 ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE,
1349                                 crc, PAGE_SIZE - BTRFS_CSUM_SIZE);
1350                 else
1351                         crc = btrfs_csum_data(root, mapped_buffer, crc,
1352                                               PAGE_SIZE);
1353
1354                 kunmap_atomic(mapped_buffer);
1355                 page_num++;
1356                 if (page_num >= sblock->page_count)
1357                         break;
1358                 WARN_ON(!sblock->pagev[page_num]->page);
1359
1360                 mapped_buffer = kmap_atomic(sblock->pagev[page_num]->page);
1361         }
1362
1363         btrfs_csum_final(crc, calculated_csum);
1364         if (memcmp(calculated_csum, csum, csum_size))
1365                 sblock->checksum_error = 1;
1366 }
1367
1368 static void scrub_complete_bio_end_io(struct bio *bio, int err)
1369 {
1370         complete((struct completion *)bio->bi_private);
1371 }
1372
1373 static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
1374                                              struct scrub_block *sblock_good,
1375                                              int force_write)
1376 {
1377         int page_num;
1378         int ret = 0;
1379
1380         for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
1381                 int ret_sub;
1382
1383                 ret_sub = scrub_repair_page_from_good_copy(sblock_bad,
1384                                                            sblock_good,
1385                                                            page_num,
1386                                                            force_write);
1387                 if (ret_sub)
1388                         ret = ret_sub;
1389         }
1390
1391         return ret;
1392 }
1393
1394 static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
1395                                             struct scrub_block *sblock_good,
1396                                             int page_num, int force_write)
1397 {
1398         struct scrub_page *page_bad = sblock_bad->pagev[page_num];
1399         struct scrub_page *page_good = sblock_good->pagev[page_num];
1400
1401         BUG_ON(page_bad->page == NULL);
1402         BUG_ON(page_good->page == NULL);
1403         if (force_write || sblock_bad->header_error ||
1404             sblock_bad->checksum_error || page_bad->io_error) {
1405                 struct bio *bio;
1406                 int ret;
1407                 DECLARE_COMPLETION_ONSTACK(complete);
1408
1409                 if (!page_bad->dev->bdev) {
1410                         printk_ratelimited(KERN_WARNING
1411                                 "btrfs: scrub_repair_page_from_good_copy(bdev == NULL) is unexpected!\n");
1412                         return -EIO;
1413                 }
1414
1415                 bio = bio_alloc(GFP_NOFS, 1);
1416                 if (!bio)
1417                         return -EIO;
1418                 bio->bi_bdev = page_bad->dev->bdev;
1419                 bio->bi_sector = page_bad->physical >> 9;
1420                 bio->bi_end_io = scrub_complete_bio_end_io;
1421                 bio->bi_private = &complete;
1422
1423                 ret = bio_add_page(bio, page_good->page, PAGE_SIZE, 0);
1424                 if (PAGE_SIZE != ret) {
1425                         bio_put(bio);
1426                         return -EIO;
1427                 }
1428                 btrfsic_submit_bio(WRITE, bio);
1429
1430                 /* this will also unplug the queue */
1431                 wait_for_completion(&complete);
1432                 if (!bio_flagged(bio, BIO_UPTODATE)) {
1433                         btrfs_dev_stat_inc_and_print(page_bad->dev,
1434                                 BTRFS_DEV_STAT_WRITE_ERRS);
1435                         btrfs_dev_replace_stats_inc(
1436                                 &sblock_bad->sctx->dev_root->fs_info->
1437                                 dev_replace.num_write_errors);
1438                         bio_put(bio);
1439                         return -EIO;
1440                 }
1441                 bio_put(bio);
1442         }
1443
1444         return 0;
1445 }
1446
1447 static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
1448 {
1449         int page_num;
1450
1451         for (page_num = 0; page_num < sblock->page_count; page_num++) {
1452                 int ret;
1453
1454                 ret = scrub_write_page_to_dev_replace(sblock, page_num);
1455                 if (ret)
1456                         btrfs_dev_replace_stats_inc(
1457                                 &sblock->sctx->dev_root->fs_info->dev_replace.
1458                                 num_write_errors);
1459         }
1460 }
1461
1462 static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
1463                                            int page_num)
1464 {
1465         struct scrub_page *spage = sblock->pagev[page_num];
1466
1467         BUG_ON(spage->page == NULL);
1468         if (spage->io_error) {
1469                 void *mapped_buffer = kmap_atomic(spage->page);
1470
1471                 memset(mapped_buffer, 0, PAGE_CACHE_SIZE);
1472                 flush_dcache_page(spage->page);
1473                 kunmap_atomic(mapped_buffer);
1474         }
1475         return scrub_add_page_to_wr_bio(sblock->sctx, spage);
1476 }
1477
1478 static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
1479                                     struct scrub_page *spage)
1480 {
1481         struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx;
1482         struct scrub_bio *sbio;
1483         int ret;
1484
1485         mutex_lock(&wr_ctx->wr_lock);
1486 again:
1487         if (!wr_ctx->wr_curr_bio) {
1488                 wr_ctx->wr_curr_bio = kzalloc(sizeof(*wr_ctx->wr_curr_bio),
1489                                               GFP_NOFS);
1490                 if (!wr_ctx->wr_curr_bio) {
1491                         mutex_unlock(&wr_ctx->wr_lock);
1492                         return -ENOMEM;
1493                 }
1494                 wr_ctx->wr_curr_bio->sctx = sctx;
1495                 wr_ctx->wr_curr_bio->page_count = 0;
1496         }
1497         sbio = wr_ctx->wr_curr_bio;
1498         if (sbio->page_count == 0) {
1499                 struct bio *bio;
1500
1501                 sbio->physical = spage->physical_for_dev_replace;
1502                 sbio->logical = spage->logical;
1503                 sbio->dev = wr_ctx->tgtdev;
1504                 bio = sbio->bio;
1505                 if (!bio) {
1506                         bio = bio_alloc(GFP_NOFS, wr_ctx->pages_per_wr_bio);
1507                         if (!bio) {
1508                                 mutex_unlock(&wr_ctx->wr_lock);
1509                                 return -ENOMEM;
1510                         }
1511                         sbio->bio = bio;
1512                 }
1513
1514                 bio->bi_private = sbio;
1515                 bio->bi_end_io = scrub_wr_bio_end_io;
1516                 bio->bi_bdev = sbio->dev->bdev;
1517                 bio->bi_sector = sbio->physical >> 9;
1518                 sbio->err = 0;
1519         } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
1520                    spage->physical_for_dev_replace ||
1521                    sbio->logical + sbio->page_count * PAGE_SIZE !=
1522                    spage->logical) {
1523                 scrub_wr_submit(sctx);
1524                 goto again;
1525         }
1526
1527         ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
1528         if (ret != PAGE_SIZE) {
1529                 if (sbio->page_count < 1) {
1530                         bio_put(sbio->bio);
1531                         sbio->bio = NULL;
1532                         mutex_unlock(&wr_ctx->wr_lock);
1533                         return -EIO;
1534                 }
1535                 scrub_wr_submit(sctx);
1536                 goto again;
1537         }
1538
1539         sbio->pagev[sbio->page_count] = spage;
1540         scrub_page_get(spage);
1541         sbio->page_count++;
1542         if (sbio->page_count == wr_ctx->pages_per_wr_bio)
1543                 scrub_wr_submit(sctx);
1544         mutex_unlock(&wr_ctx->wr_lock);
1545
1546         return 0;
1547 }
1548
1549 static void scrub_wr_submit(struct scrub_ctx *sctx)
1550 {
1551         struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx;
1552         struct scrub_bio *sbio;
1553
1554         if (!wr_ctx->wr_curr_bio)
1555                 return;
1556
1557         sbio = wr_ctx->wr_curr_bio;
1558         wr_ctx->wr_curr_bio = NULL;
1559         WARN_ON(!sbio->bio->bi_bdev);
1560         scrub_pending_bio_inc(sctx);
1561         /* process all writes in a single worker thread. Then the block layer
1562          * orders the requests before sending them to the driver which
1563          * doubled the write performance on spinning disks when measured
1564          * with Linux 3.5 */
1565         btrfsic_submit_bio(WRITE, sbio->bio);
1566 }
1567
1568 static void scrub_wr_bio_end_io(struct bio *bio, int err)
1569 {
1570         struct scrub_bio *sbio = bio->bi_private;
1571         struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info;
1572
1573         sbio->err = err;
1574         sbio->bio = bio;
1575
1576         sbio->work.func = scrub_wr_bio_end_io_worker;
1577         btrfs_queue_worker(&fs_info->scrub_wr_completion_workers, &sbio->work);
1578 }
1579
1580 static void scrub_wr_bio_end_io_worker(struct btrfs_work *work)
1581 {
1582         struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
1583         struct scrub_ctx *sctx = sbio->sctx;
1584         int i;
1585
1586         WARN_ON(sbio->page_count > SCRUB_PAGES_PER_WR_BIO);
1587         if (sbio->err) {
1588                 struct btrfs_dev_replace *dev_replace =
1589                         &sbio->sctx->dev_root->fs_info->dev_replace;
1590
1591                 for (i = 0; i < sbio->page_count; i++) {
1592                         struct scrub_page *spage = sbio->pagev[i];
1593
1594                         spage->io_error = 1;
1595                         btrfs_dev_replace_stats_inc(&dev_replace->
1596                                                     num_write_errors);
1597                 }
1598         }
1599
1600         for (i = 0; i < sbio->page_count; i++)
1601                 scrub_page_put(sbio->pagev[i]);
1602
1603         bio_put(sbio->bio);
1604         kfree(sbio);
1605         scrub_pending_bio_dec(sctx);
1606 }
1607
1608 static int scrub_checksum(struct scrub_block *sblock)
1609 {
1610         u64 flags;
1611         int ret;
1612
1613         WARN_ON(sblock->page_count < 1);
1614         flags = sblock->pagev[0]->flags;
1615         ret = 0;
1616         if (flags & BTRFS_EXTENT_FLAG_DATA)
1617                 ret = scrub_checksum_data(sblock);
1618         else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
1619                 ret = scrub_checksum_tree_block(sblock);
1620         else if (flags & BTRFS_EXTENT_FLAG_SUPER)
1621                 (void)scrub_checksum_super(sblock);
1622         else
1623                 WARN_ON(1);
1624         if (ret)
1625                 scrub_handle_errored_block(sblock);
1626
1627         return ret;
1628 }
1629
1630 static int scrub_checksum_data(struct scrub_block *sblock)
1631 {
1632         struct scrub_ctx *sctx = sblock->sctx;
1633         u8 csum[BTRFS_CSUM_SIZE];
1634         u8 *on_disk_csum;
1635         struct page *page;
1636         void *buffer;
1637         u32 crc = ~(u32)0;
1638         int fail = 0;
1639         struct btrfs_root *root = sctx->dev_root;
1640         u64 len;
1641         int index;
1642
1643         BUG_ON(sblock->page_count < 1);
1644         if (!sblock->pagev[0]->have_csum)
1645                 return 0;
1646
1647         on_disk_csum = sblock->pagev[0]->csum;
1648         page = sblock->pagev[0]->page;
1649         buffer = kmap_atomic(page);
1650
1651         len = sctx->sectorsize;
1652         index = 0;
1653         for (;;) {
1654                 u64 l = min_t(u64, len, PAGE_SIZE);
1655
1656                 crc = btrfs_csum_data(root, buffer, crc, l);
1657                 kunmap_atomic(buffer);
1658                 len -= l;
1659                 if (len == 0)
1660                         break;
1661                 index++;
1662                 BUG_ON(index >= sblock->page_count);
1663                 BUG_ON(!sblock->pagev[index]->page);
1664                 page = sblock->pagev[index]->page;
1665                 buffer = kmap_atomic(page);
1666         }
1667
1668         btrfs_csum_final(crc, csum);
1669         if (memcmp(csum, on_disk_csum, sctx->csum_size))
1670                 fail = 1;
1671
1672         return fail;
1673 }
1674
1675 static int scrub_checksum_tree_block(struct scrub_block *sblock)
1676 {
1677         struct scrub_ctx *sctx = sblock->sctx;
1678         struct btrfs_header *h;
1679         struct btrfs_root *root = sctx->dev_root;
1680         struct btrfs_fs_info *fs_info = root->fs_info;
1681         u8 calculated_csum[BTRFS_CSUM_SIZE];
1682         u8 on_disk_csum[BTRFS_CSUM_SIZE];
1683         struct page *page;
1684         void *mapped_buffer;
1685         u64 mapped_size;
1686         void *p;
1687         u32 crc = ~(u32)0;
1688         int fail = 0;
1689         int crc_fail = 0;
1690         u64 len;
1691         int index;
1692
1693         BUG_ON(sblock->page_count < 1);
1694         page = sblock->pagev[0]->page;
1695         mapped_buffer = kmap_atomic(page);
1696         h = (struct btrfs_header *)mapped_buffer;
1697         memcpy(on_disk_csum, h->csum, sctx->csum_size);
1698
1699         /*
1700          * we don't use the getter functions here, as we
1701          * a) don't have an extent buffer and
1702          * b) the page is already kmapped
1703          */
1704
1705         if (sblock->pagev[0]->logical != le64_to_cpu(h->bytenr))
1706                 ++fail;
1707
1708         if (sblock->pagev[0]->generation != le64_to_cpu(h->generation))
1709                 ++fail;
1710
1711         if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
1712                 ++fail;
1713
1714         if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
1715                    BTRFS_UUID_SIZE))
1716                 ++fail;
1717
1718         WARN_ON(sctx->nodesize != sctx->leafsize);
1719         len = sctx->nodesize - BTRFS_CSUM_SIZE;
1720         mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
1721         p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
1722         index = 0;
1723         for (;;) {
1724                 u64 l = min_t(u64, len, mapped_size);
1725
1726                 crc = btrfs_csum_data(root, p, crc, l);
1727                 kunmap_atomic(mapped_buffer);
1728                 len -= l;
1729                 if (len == 0)
1730                         break;
1731                 index++;
1732                 BUG_ON(index >= sblock->page_count);
1733                 BUG_ON(!sblock->pagev[index]->page);
1734                 page = sblock->pagev[index]->page;
1735                 mapped_buffer = kmap_atomic(page);
1736                 mapped_size = PAGE_SIZE;
1737                 p = mapped_buffer;
1738         }
1739
1740         btrfs_csum_final(crc, calculated_csum);
1741         if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
1742                 ++crc_fail;
1743
1744         return fail || crc_fail;
1745 }
1746
1747 static int scrub_checksum_super(struct scrub_block *sblock)
1748 {
1749         struct btrfs_super_block *s;
1750         struct scrub_ctx *sctx = sblock->sctx;
1751         struct btrfs_root *root = sctx->dev_root;
1752         struct btrfs_fs_info *fs_info = root->fs_info;
1753         u8 calculated_csum[BTRFS_CSUM_SIZE];
1754         u8 on_disk_csum[BTRFS_CSUM_SIZE];
1755         struct page *page;
1756         void *mapped_buffer;
1757         u64 mapped_size;
1758         void *p;
1759         u32 crc = ~(u32)0;
1760         int fail_gen = 0;
1761         int fail_cor = 0;
1762         u64 len;
1763         int index;
1764
1765         BUG_ON(sblock->page_count < 1);
1766         page = sblock->pagev[0]->page;
1767         mapped_buffer = kmap_atomic(page);
1768         s = (struct btrfs_super_block *)mapped_buffer;
1769         memcpy(on_disk_csum, s->csum, sctx->csum_size);
1770
1771         if (sblock->pagev[0]->logical != le64_to_cpu(s->bytenr))
1772                 ++fail_cor;
1773
1774         if (sblock->pagev[0]->generation != le64_to_cpu(s->generation))
1775                 ++fail_gen;
1776
1777         if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
1778                 ++fail_cor;
1779
1780         len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE;
1781         mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
1782         p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
1783         index = 0;
1784         for (;;) {
1785                 u64 l = min_t(u64, len, mapped_size);
1786
1787                 crc = btrfs_csum_data(root, p, crc, l);
1788                 kunmap_atomic(mapped_buffer);
1789                 len -= l;
1790                 if (len == 0)
1791                         break;
1792                 index++;
1793                 BUG_ON(index >= sblock->page_count);
1794                 BUG_ON(!sblock->pagev[index]->page);
1795                 page = sblock->pagev[index]->page;
1796                 mapped_buffer = kmap_atomic(page);
1797                 mapped_size = PAGE_SIZE;
1798                 p = mapped_buffer;
1799         }
1800
1801         btrfs_csum_final(crc, calculated_csum);
1802         if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
1803                 ++fail_cor;
1804
1805         if (fail_cor + fail_gen) {
1806                 /*
1807                  * if we find an error in a super block, we just report it.
1808                  * They will get written with the next transaction commit
1809                  * anyway
1810                  */
1811                 spin_lock(&sctx->stat_lock);
1812                 ++sctx->stat.super_errors;
1813                 spin_unlock(&sctx->stat_lock);
1814                 if (fail_cor)
1815                         btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
1816                                 BTRFS_DEV_STAT_CORRUPTION_ERRS);
1817                 else
1818                         btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
1819                                 BTRFS_DEV_STAT_GENERATION_ERRS);
1820         }
1821
1822         return fail_cor + fail_gen;
1823 }
1824
1825 static void scrub_block_get(struct scrub_block *sblock)
1826 {
1827         atomic_inc(&sblock->ref_count);
1828 }
1829
1830 static void scrub_block_put(struct scrub_block *sblock)
1831 {
1832         if (atomic_dec_and_test(&sblock->ref_count)) {
1833                 int i;
1834
1835                 for (i = 0; i < sblock->page_count; i++)
1836                         scrub_page_put(sblock->pagev[i]);
1837                 kfree(sblock);
1838         }
1839 }
1840
1841 static void scrub_page_get(struct scrub_page *spage)
1842 {
1843         atomic_inc(&spage->ref_count);
1844 }
1845
1846 static void scrub_page_put(struct scrub_page *spage)
1847 {
1848         if (atomic_dec_and_test(&spage->ref_count)) {
1849                 if (spage->page)
1850                         __free_page(spage->page);
1851                 kfree(spage);
1852         }
1853 }
1854
1855 static void scrub_submit(struct scrub_ctx *sctx)
1856 {
1857         struct scrub_bio *sbio;
1858
1859         if (sctx->curr == -1)
1860                 return;
1861
1862         sbio = sctx->bios[sctx->curr];
1863         sctx->curr = -1;
1864         scrub_pending_bio_inc(sctx);
1865
1866         if (!sbio->bio->bi_bdev) {
1867                 /*
1868                  * this case should not happen. If btrfs_map_block() is
1869                  * wrong, it could happen for dev-replace operations on
1870                  * missing devices when no mirrors are available, but in
1871                  * this case it should already fail the mount.
1872                  * This case is handled correctly (but _very_ slowly).
1873                  */
1874                 printk_ratelimited(KERN_WARNING
1875                         "btrfs: scrub_submit(bio bdev == NULL) is unexpected!\n");
1876                 bio_endio(sbio->bio, -EIO);
1877         } else {
1878                 btrfsic_submit_bio(READ, sbio->bio);
1879         }
1880 }
1881
1882 static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
1883                                     struct scrub_page *spage)
1884 {
1885         struct scrub_block *sblock = spage->sblock;
1886         struct scrub_bio *sbio;
1887         int ret;
1888
1889 again:
1890         /*
1891          * grab a fresh bio or wait for one to become available
1892          */
1893         while (sctx->curr == -1) {
1894                 spin_lock(&sctx->list_lock);
1895                 sctx->curr = sctx->first_free;
1896                 if (sctx->curr != -1) {
1897                         sctx->first_free = sctx->bios[sctx->curr]->next_free;
1898                         sctx->bios[sctx->curr]->next_free = -1;
1899                         sctx->bios[sctx->curr]->page_count = 0;
1900                         spin_unlock(&sctx->list_lock);
1901                 } else {
1902                         spin_unlock(&sctx->list_lock);
1903                         wait_event(sctx->list_wait, sctx->first_free != -1);
1904                 }
1905         }
1906         sbio = sctx->bios[sctx->curr];
1907         if (sbio->page_count == 0) {
1908                 struct bio *bio;
1909
1910                 sbio->physical = spage->physical;
1911                 sbio->logical = spage->logical;
1912                 sbio->dev = spage->dev;
1913                 bio = sbio->bio;
1914                 if (!bio) {
1915                         bio = bio_alloc(GFP_NOFS, sctx->pages_per_rd_bio);
1916                         if (!bio)
1917                                 return -ENOMEM;
1918                         sbio->bio = bio;
1919                 }
1920
1921                 bio->bi_private = sbio;
1922                 bio->bi_end_io = scrub_bio_end_io;
1923                 bio->bi_bdev = sbio->dev->bdev;
1924                 bio->bi_sector = sbio->physical >> 9;
1925                 sbio->err = 0;
1926         } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
1927                    spage->physical ||
1928                    sbio->logical + sbio->page_count * PAGE_SIZE !=
1929                    spage->logical ||
1930                    sbio->dev != spage->dev) {
1931                 scrub_submit(sctx);
1932                 goto again;
1933         }
1934
1935         sbio->pagev[sbio->page_count] = spage;
1936         ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
1937         if (ret != PAGE_SIZE) {
1938                 if (sbio->page_count < 1) {
1939                         bio_put(sbio->bio);
1940                         sbio->bio = NULL;
1941                         return -EIO;
1942                 }
1943                 scrub_submit(sctx);
1944                 goto again;
1945         }
1946
1947         scrub_block_get(sblock); /* one for the page added to the bio */
1948         atomic_inc(&sblock->outstanding_pages);
1949         sbio->page_count++;
1950         if (sbio->page_count == sctx->pages_per_rd_bio)
1951                 scrub_submit(sctx);
1952
1953         return 0;
1954 }
1955
1956 static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
1957                        u64 physical, struct btrfs_device *dev, u64 flags,
1958                        u64 gen, int mirror_num, u8 *csum, int force,
1959                        u64 physical_for_dev_replace)
1960 {
1961         struct scrub_block *sblock;
1962         int index;
1963
1964         sblock = kzalloc(sizeof(*sblock), GFP_NOFS);
1965         if (!sblock) {
1966                 spin_lock(&sctx->stat_lock);
1967                 sctx->stat.malloc_errors++;
1968                 spin_unlock(&sctx->stat_lock);
1969                 return -ENOMEM;
1970         }
1971
1972         /* one ref inside this function, plus one for each page added to
1973          * a bio later on */
1974         atomic_set(&sblock->ref_count, 1);
1975         sblock->sctx = sctx;
1976         sblock->no_io_error_seen = 1;
1977
1978         for (index = 0; len > 0; index++) {
1979                 struct scrub_page *spage;
1980                 u64 l = min_t(u64, len, PAGE_SIZE);
1981
1982                 spage = kzalloc(sizeof(*spage), GFP_NOFS);
1983                 if (!spage) {
1984 leave_nomem:
1985                         spin_lock(&sctx->stat_lock);
1986                         sctx->stat.malloc_errors++;
1987                         spin_unlock(&sctx->stat_lock);
1988                         scrub_block_put(sblock);
1989                         return -ENOMEM;
1990                 }
1991                 BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
1992                 scrub_page_get(spage);
1993                 sblock->pagev[index] = spage;
1994                 spage->sblock = sblock;
1995                 spage->dev = dev;
1996                 spage->flags = flags;
1997                 spage->generation = gen;
1998                 spage->logical = logical;
1999                 spage->physical = physical;
2000                 spage->physical_for_dev_replace = physical_for_dev_replace;
2001                 spage->mirror_num = mirror_num;
2002                 if (csum) {
2003                         spage->have_csum = 1;
2004                         memcpy(spage->csum, csum, sctx->csum_size);
2005                 } else {
2006                         spage->have_csum = 0;
2007                 }
2008                 sblock->page_count++;
2009                 spage->page = alloc_page(GFP_NOFS);
2010                 if (!spage->page)
2011                         goto leave_nomem;
2012                 len -= l;
2013                 logical += l;
2014                 physical += l;
2015                 physical_for_dev_replace += l;
2016         }
2017
2018         WARN_ON(sblock->page_count == 0);
2019         for (index = 0; index < sblock->page_count; index++) {
2020                 struct scrub_page *spage = sblock->pagev[index];
2021                 int ret;
2022
2023                 ret = scrub_add_page_to_rd_bio(sctx, spage);
2024                 if (ret) {
2025                         scrub_block_put(sblock);
2026                         return ret;
2027                 }
2028         }
2029
2030         if (force)
2031                 scrub_submit(sctx);
2032
2033         /* last one frees, either here or in bio completion for last page */
2034         scrub_block_put(sblock);
2035         return 0;
2036 }
2037
2038 static void scrub_bio_end_io(struct bio *bio, int err)
2039 {
2040         struct scrub_bio *sbio = bio->bi_private;
2041         struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info;
2042
2043         sbio->err = err;
2044         sbio->bio = bio;
2045
2046         btrfs_queue_worker(&fs_info->scrub_workers, &sbio->work);
2047 }
2048
2049 static void scrub_bio_end_io_worker(struct btrfs_work *work)
2050 {
2051         struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
2052         struct scrub_ctx *sctx = sbio->sctx;
2053         int i;
2054
2055         BUG_ON(sbio->page_count > SCRUB_PAGES_PER_RD_BIO);
2056         if (sbio->err) {
2057                 for (i = 0; i < sbio->page_count; i++) {
2058                         struct scrub_page *spage = sbio->pagev[i];
2059
2060                         spage->io_error = 1;
2061                         spage->sblock->no_io_error_seen = 0;
2062                 }
2063         }
2064
2065         /* now complete the scrub_block items that have all pages completed */
2066         for (i = 0; i < sbio->page_count; i++) {
2067                 struct scrub_page *spage = sbio->pagev[i];
2068                 struct scrub_block *sblock = spage->sblock;
2069
2070                 if (atomic_dec_and_test(&sblock->outstanding_pages))
2071                         scrub_block_complete(sblock);
2072                 scrub_block_put(sblock);
2073         }
2074
2075         bio_put(sbio->bio);
2076         sbio->bio = NULL;
2077         spin_lock(&sctx->list_lock);
2078         sbio->next_free = sctx->first_free;
2079         sctx->first_free = sbio->index;
2080         spin_unlock(&sctx->list_lock);
2081
2082         if (sctx->is_dev_replace &&
2083             atomic_read(&sctx->wr_ctx.flush_all_writes)) {
2084                 mutex_lock(&sctx->wr_ctx.wr_lock);
2085                 scrub_wr_submit(sctx);
2086                 mutex_unlock(&sctx->wr_ctx.wr_lock);
2087         }
2088
2089         scrub_pending_bio_dec(sctx);
2090 }
2091
2092 static void scrub_block_complete(struct scrub_block *sblock)
2093 {
2094         if (!sblock->no_io_error_seen) {
2095                 scrub_handle_errored_block(sblock);
2096         } else {
2097                 /*
2098                  * if has checksum error, write via repair mechanism in
2099                  * dev replace case, otherwise write here in dev replace
2100                  * case.
2101                  */
2102                 if (!scrub_checksum(sblock) && sblock->sctx->is_dev_replace)
2103                         scrub_write_block_to_dev_replace(sblock);
2104         }
2105 }
2106
2107 static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len,
2108                            u8 *csum)
2109 {
2110         struct btrfs_ordered_sum *sum = NULL;
2111         int ret = 0;
2112         unsigned long i;
2113         unsigned long num_sectors;
2114
2115         while (!list_empty(&sctx->csum_list)) {
2116                 sum = list_first_entry(&sctx->csum_list,
2117                                        struct btrfs_ordered_sum, list);
2118                 if (sum->bytenr > logical)
2119                         return 0;
2120                 if (sum->bytenr + sum->len > logical)
2121                         break;
2122
2123                 ++sctx->stat.csum_discards;
2124                 list_del(&sum->list);
2125                 kfree(sum);
2126                 sum = NULL;
2127         }
2128         if (!sum)
2129                 return 0;
2130
2131         num_sectors = sum->len / sctx->sectorsize;
2132         for (i = 0; i < num_sectors; ++i) {
2133                 if (sum->sums[i].bytenr == logical) {
2134                         memcpy(csum, &sum->sums[i].sum, sctx->csum_size);
2135                         ret = 1;
2136                         break;
2137                 }
2138         }
2139         if (ret && i == num_sectors - 1) {
2140                 list_del(&sum->list);
2141                 kfree(sum);
2142         }
2143         return ret;
2144 }
2145
2146 /* scrub extent tries to collect up to 64 kB for each bio */
2147 static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len,
2148                         u64 physical, struct btrfs_device *dev, u64 flags,
2149                         u64 gen, int mirror_num, u64 physical_for_dev_replace)
2150 {
2151         int ret;
2152         u8 csum[BTRFS_CSUM_SIZE];
2153         u32 blocksize;
2154
2155         if (flags & BTRFS_EXTENT_FLAG_DATA) {
2156                 blocksize = sctx->sectorsize;
2157                 spin_lock(&sctx->stat_lock);
2158                 sctx->stat.data_extents_scrubbed++;
2159                 sctx->stat.data_bytes_scrubbed += len;
2160                 spin_unlock(&sctx->stat_lock);
2161         } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
2162                 WARN_ON(sctx->nodesize != sctx->leafsize);
2163                 blocksize = sctx->nodesize;
2164                 spin_lock(&sctx->stat_lock);
2165                 sctx->stat.tree_extents_scrubbed++;
2166                 sctx->stat.tree_bytes_scrubbed += len;
2167                 spin_unlock(&sctx->stat_lock);
2168         } else {
2169                 blocksize = sctx->sectorsize;
2170                 WARN_ON(1);
2171         }
2172
2173         while (len) {
2174                 u64 l = min_t(u64, len, blocksize);
2175                 int have_csum = 0;
2176
2177                 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2178                         /* push csums to sbio */
2179                         have_csum = scrub_find_csum(sctx, logical, l, csum);
2180                         if (have_csum == 0)
2181                                 ++sctx->stat.no_csum;
2182                         if (sctx->is_dev_replace && !have_csum) {
2183                                 ret = copy_nocow_pages(sctx, logical, l,
2184                                                        mirror_num,
2185                                                       physical_for_dev_replace);
2186                                 goto behind_scrub_pages;
2187                         }
2188                 }
2189                 ret = scrub_pages(sctx, logical, l, physical, dev, flags, gen,
2190                                   mirror_num, have_csum ? csum : NULL, 0,
2191                                   physical_for_dev_replace);
2192 behind_scrub_pages:
2193                 if (ret)
2194                         return ret;
2195                 len -= l;
2196                 logical += l;
2197                 physical += l;
2198                 physical_for_dev_replace += l;
2199         }
2200         return 0;
2201 }
2202
2203 static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
2204                                            struct map_lookup *map,
2205                                            struct btrfs_device *scrub_dev,
2206                                            int num, u64 base, u64 length,
2207                                            int is_dev_replace)
2208 {
2209         struct btrfs_path *path;
2210         struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
2211         struct btrfs_root *root = fs_info->extent_root;
2212         struct btrfs_root *csum_root = fs_info->csum_root;
2213         struct btrfs_extent_item *extent;
2214         struct blk_plug plug;
2215         u64 flags;
2216         int ret;
2217         int slot;
2218         int i;
2219         u64 nstripes;
2220         struct extent_buffer *l;
2221         struct btrfs_key key;
2222         u64 physical;
2223         u64 logical;
2224         u64 generation;
2225         int mirror_num;
2226         struct reada_control *reada1;
2227         struct reada_control *reada2;
2228         struct btrfs_key key_start;
2229         struct btrfs_key key_end;
2230         u64 increment = map->stripe_len;
2231         u64 offset;
2232         u64 extent_logical;
2233         u64 extent_physical;
2234         u64 extent_len;
2235         struct btrfs_device *extent_dev;
2236         int extent_mirror_num;
2237
2238         nstripes = length;
2239         offset = 0;
2240         do_div(nstripes, map->stripe_len);
2241         if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
2242                 offset = map->stripe_len * num;
2243                 increment = map->stripe_len * map->num_stripes;
2244                 mirror_num = 1;
2245         } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
2246                 int factor = map->num_stripes / map->sub_stripes;
2247                 offset = map->stripe_len * (num / map->sub_stripes);
2248                 increment = map->stripe_len * factor;
2249                 mirror_num = num % map->sub_stripes + 1;
2250         } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
2251                 increment = map->stripe_len;
2252                 mirror_num = num % map->num_stripes + 1;
2253         } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
2254                 increment = map->stripe_len;
2255                 mirror_num = num % map->num_stripes + 1;
2256         } else {
2257                 increment = map->stripe_len;
2258                 mirror_num = 1;
2259         }
2260
2261         path = btrfs_alloc_path();
2262         if (!path)
2263                 return -ENOMEM;
2264
2265         /*
2266          * work on commit root. The related disk blocks are static as
2267          * long as COW is applied. This means, it is save to rewrite
2268          * them to repair disk errors without any race conditions
2269          */
2270         path->search_commit_root = 1;
2271         path->skip_locking = 1;
2272
2273         /*
2274          * trigger the readahead for extent tree csum tree and wait for
2275          * completion. During readahead, the scrub is officially paused
2276          * to not hold off transaction commits
2277          */
2278         logical = base + offset;
2279
2280         wait_event(sctx->list_wait,
2281                    atomic_read(&sctx->bios_in_flight) == 0);
2282         atomic_inc(&fs_info->scrubs_paused);
2283         wake_up(&fs_info->scrub_pause_wait);
2284
2285         /* FIXME it might be better to start readahead at commit root */
2286         key_start.objectid = logical;
2287         key_start.type = BTRFS_EXTENT_ITEM_KEY;
2288         key_start.offset = (u64)0;
2289         key_end.objectid = base + offset + nstripes * increment;
2290         key_end.type = BTRFS_EXTENT_ITEM_KEY;
2291         key_end.offset = (u64)0;
2292         reada1 = btrfs_reada_add(root, &key_start, &key_end);
2293
2294         key_start.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
2295         key_start.type = BTRFS_EXTENT_CSUM_KEY;
2296         key_start.offset = logical;
2297         key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
2298         key_end.type = BTRFS_EXTENT_CSUM_KEY;
2299         key_end.offset = base + offset + nstripes * increment;
2300         reada2 = btrfs_reada_add(csum_root, &key_start, &key_end);
2301
2302         if (!IS_ERR(reada1))
2303                 btrfs_reada_wait(reada1);
2304         if (!IS_ERR(reada2))
2305                 btrfs_reada_wait(reada2);
2306
2307         mutex_lock(&fs_info->scrub_lock);
2308         while (atomic_read(&fs_info->scrub_pause_req)) {
2309                 mutex_unlock(&fs_info->scrub_lock);
2310                 wait_event(fs_info->scrub_pause_wait,
2311                    atomic_read(&fs_info->scrub_pause_req) == 0);
2312                 mutex_lock(&fs_info->scrub_lock);
2313         }
2314         atomic_dec(&fs_info->scrubs_paused);
2315         mutex_unlock(&fs_info->scrub_lock);
2316         wake_up(&fs_info->scrub_pause_wait);
2317
2318         /*
2319          * collect all data csums for the stripe to avoid seeking during
2320          * the scrub. This might currently (crc32) end up to be about 1MB
2321          */
2322         blk_start_plug(&plug);
2323
2324         /*
2325          * now find all extents for each stripe and scrub them
2326          */
2327         logical = base + offset;
2328         physical = map->stripes[num].physical;
2329         ret = 0;
2330         for (i = 0; i < nstripes; ++i) {
2331                 /*
2332                  * canceled?
2333                  */
2334                 if (atomic_read(&fs_info->scrub_cancel_req) ||
2335                     atomic_read(&sctx->cancel_req)) {
2336                         ret = -ECANCELED;
2337                         goto out;
2338                 }
2339                 /*
2340                  * check to see if we have to pause
2341                  */
2342                 if (atomic_read(&fs_info->scrub_pause_req)) {
2343                         /* push queued extents */
2344                         atomic_set(&sctx->wr_ctx.flush_all_writes, 1);
2345                         scrub_submit(sctx);
2346                         mutex_lock(&sctx->wr_ctx.wr_lock);
2347                         scrub_wr_submit(sctx);
2348                         mutex_unlock(&sctx->wr_ctx.wr_lock);
2349                         wait_event(sctx->list_wait,
2350                                    atomic_read(&sctx->bios_in_flight) == 0);
2351                         atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
2352                         atomic_inc(&fs_info->scrubs_paused);
2353                         wake_up(&fs_info->scrub_pause_wait);
2354                         mutex_lock(&fs_info->scrub_lock);
2355                         while (atomic_read(&fs_info->scrub_pause_req)) {
2356                                 mutex_unlock(&fs_info->scrub_lock);
2357                                 wait_event(fs_info->scrub_pause_wait,
2358                                    atomic_read(&fs_info->scrub_pause_req) == 0);
2359                                 mutex_lock(&fs_info->scrub_lock);
2360                         }
2361                         atomic_dec(&fs_info->scrubs_paused);
2362                         mutex_unlock(&fs_info->scrub_lock);
2363                         wake_up(&fs_info->scrub_pause_wait);
2364                 }
2365
2366                 ret = btrfs_lookup_csums_range(csum_root, logical,
2367                                                logical + map->stripe_len - 1,
2368                                                &sctx->csum_list, 1);
2369                 if (ret)
2370                         goto out;
2371
2372                 key.objectid = logical;
2373                 key.type = BTRFS_EXTENT_ITEM_KEY;
2374                 key.offset = (u64)0;
2375
2376                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2377                 if (ret < 0)
2378                         goto out;
2379                 if (ret > 0) {
2380                         ret = btrfs_previous_item(root, path, 0,
2381                                                   BTRFS_EXTENT_ITEM_KEY);
2382                         if (ret < 0)
2383                                 goto out;
2384                         if (ret > 0) {
2385                                 /* there's no smaller item, so stick with the
2386                                  * larger one */
2387                                 btrfs_release_path(path);
2388                                 ret = btrfs_search_slot(NULL, root, &key,
2389                                                         path, 0, 0);
2390                                 if (ret < 0)
2391                                         goto out;
2392                         }
2393                 }
2394
2395                 while (1) {
2396                         l = path->nodes[0];
2397                         slot = path->slots[0];
2398                         if (slot >= btrfs_header_nritems(l)) {
2399                                 ret = btrfs_next_leaf(root, path);
2400                                 if (ret == 0)
2401                                         continue;
2402                                 if (ret < 0)
2403                                         goto out;
2404
2405                                 break;
2406                         }
2407                         btrfs_item_key_to_cpu(l, &key, slot);
2408
2409                         if (key.objectid + key.offset <= logical)
2410                                 goto next;
2411
2412                         if (key.objectid >= logical + map->stripe_len)
2413                                 break;
2414
2415                         if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY)
2416                                 goto next;
2417
2418                         extent = btrfs_item_ptr(l, slot,
2419                                                 struct btrfs_extent_item);
2420                         flags = btrfs_extent_flags(l, extent);
2421                         generation = btrfs_extent_generation(l, extent);
2422
2423                         if (key.objectid < logical &&
2424                             (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) {
2425                                 printk(KERN_ERR
2426                                        "btrfs scrub: tree block %llu spanning "
2427                                        "stripes, ignored. logical=%llu\n",
2428                                        (unsigned long long)key.objectid,
2429                                        (unsigned long long)logical);
2430                                 goto next;
2431                         }
2432
2433                         /*
2434                          * trim extent to this stripe
2435                          */
2436                         if (key.objectid < logical) {
2437                                 key.offset -= logical - key.objectid;
2438                                 key.objectid = logical;
2439                         }
2440                         if (key.objectid + key.offset >
2441                             logical + map->stripe_len) {
2442                                 key.offset = logical + map->stripe_len -
2443                                              key.objectid;
2444                         }
2445
2446                         extent_logical = key.objectid;
2447                         extent_physical = key.objectid - logical + physical;
2448                         extent_len = key.offset;
2449                         extent_dev = scrub_dev;
2450                         extent_mirror_num = mirror_num;
2451                         if (is_dev_replace)
2452                                 scrub_remap_extent(fs_info, extent_logical,
2453                                                    extent_len, &extent_physical,
2454                                                    &extent_dev,
2455                                                    &extent_mirror_num);
2456                         ret = scrub_extent(sctx, extent_logical, extent_len,
2457                                            extent_physical, extent_dev, flags,
2458                                            generation, extent_mirror_num,
2459                                            key.objectid - logical + physical);
2460                         if (ret)
2461                                 goto out;
2462
2463 next:
2464                         path->slots[0]++;
2465                 }
2466                 btrfs_release_path(path);
2467                 logical += increment;
2468                 physical += map->stripe_len;
2469                 spin_lock(&sctx->stat_lock);
2470                 sctx->stat.last_physical = physical;
2471                 spin_unlock(&sctx->stat_lock);
2472         }
2473 out:
2474         /* push queued extents */
2475         scrub_submit(sctx);
2476         mutex_lock(&sctx->wr_ctx.wr_lock);
2477         scrub_wr_submit(sctx);
2478         mutex_unlock(&sctx->wr_ctx.wr_lock);
2479
2480         blk_finish_plug(&plug);
2481         btrfs_free_path(path);
2482         return ret < 0 ? ret : 0;
2483 }
2484
2485 static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
2486                                           struct btrfs_device *scrub_dev,
2487                                           u64 chunk_tree, u64 chunk_objectid,
2488                                           u64 chunk_offset, u64 length,
2489                                           u64 dev_offset, int is_dev_replace)
2490 {
2491         struct btrfs_mapping_tree *map_tree =
2492                 &sctx->dev_root->fs_info->mapping_tree;
2493         struct map_lookup *map;
2494         struct extent_map *em;
2495         int i;
2496         int ret = 0;
2497
2498         read_lock(&map_tree->map_tree.lock);
2499         em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
2500         read_unlock(&map_tree->map_tree.lock);
2501
2502         if (!em)
2503                 return -EINVAL;
2504
2505         map = (struct map_lookup *)em->bdev;
2506         if (em->start != chunk_offset)
2507                 goto out;
2508
2509         if (em->len < length)
2510                 goto out;
2511
2512         for (i = 0; i < map->num_stripes; ++i) {
2513                 if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
2514                     map->stripes[i].physical == dev_offset) {
2515                         ret = scrub_stripe(sctx, map, scrub_dev, i,
2516                                            chunk_offset, length,
2517                                            is_dev_replace);
2518                         if (ret)
2519                                 goto out;
2520                 }
2521         }
2522 out:
2523         free_extent_map(em);
2524
2525         return ret;
2526 }
2527
2528 static noinline_for_stack
2529 int scrub_enumerate_chunks(struct scrub_ctx *sctx,
2530                            struct btrfs_device *scrub_dev, u64 start, u64 end,
2531                            int is_dev_replace)
2532 {
2533         struct btrfs_dev_extent *dev_extent = NULL;
2534         struct btrfs_path *path;
2535         struct btrfs_root *root = sctx->dev_root;
2536         struct btrfs_fs_info *fs_info = root->fs_info;
2537         u64 length;
2538         u64 chunk_tree;
2539         u64 chunk_objectid;
2540         u64 chunk_offset;
2541         int ret;
2542         int slot;
2543         struct extent_buffer *l;
2544         struct btrfs_key key;
2545         struct btrfs_key found_key;
2546         struct btrfs_block_group_cache *cache;
2547         struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
2548
2549         path = btrfs_alloc_path();
2550         if (!path)
2551                 return -ENOMEM;
2552
2553         path->reada = 2;
2554         path->search_commit_root = 1;
2555         path->skip_locking = 1;
2556
2557         key.objectid = scrub_dev->devid;
2558         key.offset = 0ull;
2559         key.type = BTRFS_DEV_EXTENT_KEY;
2560
2561         while (1) {
2562                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2563                 if (ret < 0)
2564                         break;
2565                 if (ret > 0) {
2566                         if (path->slots[0] >=
2567                             btrfs_header_nritems(path->nodes[0])) {
2568                                 ret = btrfs_next_leaf(root, path);
2569                                 if (ret)
2570                                         break;
2571                         }
2572                 }
2573
2574                 l = path->nodes[0];
2575                 slot = path->slots[0];
2576
2577                 btrfs_item_key_to_cpu(l, &found_key, slot);
2578
2579                 if (found_key.objectid != scrub_dev->devid)
2580                         break;
2581
2582                 if (btrfs_key_type(&found_key) != BTRFS_DEV_EXTENT_KEY)
2583                         break;
2584
2585                 if (found_key.offset >= end)
2586                         break;
2587
2588                 if (found_key.offset < key.offset)
2589                         break;
2590
2591                 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
2592                 length = btrfs_dev_extent_length(l, dev_extent);
2593
2594                 if (found_key.offset + length <= start) {
2595                         key.offset = found_key.offset + length;
2596                         btrfs_release_path(path);
2597                         continue;
2598                 }
2599
2600                 chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
2601                 chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
2602                 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
2603
2604                 /*
2605                  * get a reference on the corresponding block group to prevent
2606                  * the chunk from going away while we scrub it
2607                  */
2608                 cache = btrfs_lookup_block_group(fs_info, chunk_offset);
2609                 if (!cache) {
2610                         ret = -ENOENT;
2611                         break;
2612                 }
2613                 dev_replace->cursor_right = found_key.offset + length;
2614                 dev_replace->cursor_left = found_key.offset;
2615                 dev_replace->item_needs_writeback = 1;
2616                 ret = scrub_chunk(sctx, scrub_dev, chunk_tree, chunk_objectid,
2617                                   chunk_offset, length, found_key.offset,
2618                                   is_dev_replace);
2619
2620                 /*
2621                  * flush, submit all pending read and write bios, afterwards
2622                  * wait for them.
2623                  * Note that in the dev replace case, a read request causes
2624                  * write requests that are submitted in the read completion
2625                  * worker. Therefore in the current situation, it is required
2626                  * that all write requests are flushed, so that all read and
2627                  * write requests are really completed when bios_in_flight
2628                  * changes to 0.
2629                  */
2630                 atomic_set(&sctx->wr_ctx.flush_all_writes, 1);
2631                 scrub_submit(sctx);
2632                 mutex_lock(&sctx->wr_ctx.wr_lock);
2633                 scrub_wr_submit(sctx);
2634                 mutex_unlock(&sctx->wr_ctx.wr_lock);
2635
2636                 wait_event(sctx->list_wait,
2637                            atomic_read(&sctx->bios_in_flight) == 0);
2638                 atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
2639                 atomic_inc(&fs_info->scrubs_paused);
2640                 wake_up(&fs_info->scrub_pause_wait);
2641                 wait_event(sctx->list_wait,
2642                            atomic_read(&sctx->workers_pending) == 0);
2643
2644                 mutex_lock(&fs_info->scrub_lock);
2645                 while (atomic_read(&fs_info->scrub_pause_req)) {
2646                         mutex_unlock(&fs_info->scrub_lock);
2647                         wait_event(fs_info->scrub_pause_wait,
2648                            atomic_read(&fs_info->scrub_pause_req) == 0);
2649                         mutex_lock(&fs_info->scrub_lock);
2650                 }
2651                 atomic_dec(&fs_info->scrubs_paused);
2652                 mutex_unlock(&fs_info->scrub_lock);
2653                 wake_up(&fs_info->scrub_pause_wait);
2654
2655                 dev_replace->cursor_left = dev_replace->cursor_right;
2656                 dev_replace->item_needs_writeback = 1;
2657                 btrfs_put_block_group(cache);
2658                 if (ret)
2659                         break;
2660                 if (atomic64_read(&dev_replace->num_write_errors) > 0) {
2661                         ret = -EIO;
2662                         break;
2663                 }
2664                 if (sctx->stat.malloc_errors > 0) {
2665                         ret = -ENOMEM;
2666                         break;
2667                 }
2668
2669                 key.offset = found_key.offset + length;
2670                 btrfs_release_path(path);
2671         }
2672
2673         btrfs_free_path(path);
2674
2675         /*
2676          * ret can still be 1 from search_slot or next_leaf,
2677          * that's not an error
2678          */
2679         return ret < 0 ? ret : 0;
2680 }
2681
2682 static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
2683                                            struct btrfs_device *scrub_dev)
2684 {
2685         int     i;
2686         u64     bytenr;
2687         u64     gen;
2688         int     ret;
2689         struct btrfs_root *root = sctx->dev_root;
2690
2691         if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
2692                 return -EIO;
2693
2694         gen = root->fs_info->last_trans_committed;
2695
2696         for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
2697                 bytenr = btrfs_sb_offset(i);
2698                 if (bytenr + BTRFS_SUPER_INFO_SIZE > scrub_dev->total_bytes)
2699                         break;
2700
2701                 ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
2702                                   scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i,
2703                                   NULL, 1, bytenr);
2704                 if (ret)
2705                         return ret;
2706         }
2707         wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
2708
2709         return 0;
2710 }
2711
2712 /*
2713  * get a reference count on fs_info->scrub_workers. start worker if necessary
2714  */
2715 static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
2716                                                 int is_dev_replace)
2717 {
2718         int ret = 0;
2719
2720         mutex_lock(&fs_info->scrub_lock);
2721         if (fs_info->scrub_workers_refcnt == 0) {
2722                 if (is_dev_replace)
2723                         btrfs_init_workers(&fs_info->scrub_workers, "scrub", 1,
2724                                         &fs_info->generic_worker);
2725                 else
2726                         btrfs_init_workers(&fs_info->scrub_workers, "scrub",
2727                                         fs_info->thread_pool_size,
2728                                         &fs_info->generic_worker);
2729                 fs_info->scrub_workers.idle_thresh = 4;
2730                 ret = btrfs_start_workers(&fs_info->scrub_workers);
2731                 if (ret)
2732                         goto out;
2733                 btrfs_init_workers(&fs_info->scrub_wr_completion_workers,
2734                                    "scrubwrc",
2735                                    fs_info->thread_pool_size,
2736                                    &fs_info->generic_worker);
2737                 fs_info->scrub_wr_completion_workers.idle_thresh = 2;
2738                 ret = btrfs_start_workers(
2739                                 &fs_info->scrub_wr_completion_workers);
2740                 if (ret)
2741                         goto out;
2742                 btrfs_init_workers(&fs_info->scrub_nocow_workers, "scrubnc", 1,
2743                                    &fs_info->generic_worker);
2744                 ret = btrfs_start_workers(&fs_info->scrub_nocow_workers);
2745                 if (ret)
2746                         goto out;
2747         }
2748         ++fs_info->scrub_workers_refcnt;
2749 out:
2750         mutex_unlock(&fs_info->scrub_lock);
2751
2752         return ret;
2753 }
2754
2755 static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info)
2756 {
2757         mutex_lock(&fs_info->scrub_lock);
2758         if (--fs_info->scrub_workers_refcnt == 0) {
2759                 btrfs_stop_workers(&fs_info->scrub_workers);
2760                 btrfs_stop_workers(&fs_info->scrub_wr_completion_workers);
2761                 btrfs_stop_workers(&fs_info->scrub_nocow_workers);
2762         }
2763         WARN_ON(fs_info->scrub_workers_refcnt < 0);
2764         mutex_unlock(&fs_info->scrub_lock);
2765 }
2766
2767 int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
2768                     u64 end, struct btrfs_scrub_progress *progress,
2769                     int readonly, int is_dev_replace)
2770 {
2771         struct scrub_ctx *sctx;
2772         int ret;
2773         struct btrfs_device *dev;
2774
2775         if (btrfs_fs_closing(fs_info))
2776                 return -EINVAL;
2777
2778         /*
2779          * check some assumptions
2780          */
2781         if (fs_info->chunk_root->nodesize != fs_info->chunk_root->leafsize) {
2782                 printk(KERN_ERR
2783                        "btrfs_scrub: size assumption nodesize == leafsize (%d == %d) fails\n",
2784                        fs_info->chunk_root->nodesize,
2785                        fs_info->chunk_root->leafsize);
2786                 return -EINVAL;
2787         }
2788
2789         if (fs_info->chunk_root->nodesize > BTRFS_STRIPE_LEN) {
2790                 /*
2791                  * in this case scrub is unable to calculate the checksum
2792                  * the way scrub is implemented. Do not handle this
2793                  * situation at all because it won't ever happen.
2794                  */
2795                 printk(KERN_ERR
2796                        "btrfs_scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails\n",
2797                        fs_info->chunk_root->nodesize, BTRFS_STRIPE_LEN);
2798                 return -EINVAL;
2799         }
2800
2801         if (fs_info->chunk_root->sectorsize != PAGE_SIZE) {
2802                 /* not supported for data w/o checksums */
2803                 printk(KERN_ERR
2804                        "btrfs_scrub: size assumption sectorsize != PAGE_SIZE (%d != %lld) fails\n",
2805                        fs_info->chunk_root->sectorsize,
2806                        (unsigned long long)PAGE_SIZE);
2807                 return -EINVAL;
2808         }
2809
2810         if (fs_info->chunk_root->nodesize >
2811             PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK ||
2812             fs_info->chunk_root->sectorsize >
2813             PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK) {
2814                 /*
2815                  * would exhaust the array bounds of pagev member in
2816                  * struct scrub_block
2817                  */
2818                 pr_err("btrfs_scrub: size assumption nodesize and sectorsize <= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails\n",
2819                        fs_info->chunk_root->nodesize,
2820                        SCRUB_MAX_PAGES_PER_BLOCK,
2821                        fs_info->chunk_root->sectorsize,
2822                        SCRUB_MAX_PAGES_PER_BLOCK);
2823                 return -EINVAL;
2824         }
2825
2826         ret = scrub_workers_get(fs_info, is_dev_replace);
2827         if (ret)
2828                 return ret;
2829
2830         mutex_lock(&fs_info->fs_devices->device_list_mutex);
2831         dev = btrfs_find_device(fs_info, devid, NULL, NULL);
2832         if (!dev || (dev->missing && !is_dev_replace)) {
2833                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2834                 scrub_workers_put(fs_info);
2835                 return -ENODEV;
2836         }
2837         mutex_lock(&fs_info->scrub_lock);
2838
2839         if (!dev->in_fs_metadata || dev->is_tgtdev_for_dev_replace) {
2840                 mutex_unlock(&fs_info->scrub_lock);
2841                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2842                 scrub_workers_put(fs_info);
2843                 return -EIO;
2844         }
2845
2846         if (dev->scrub_device) {
2847                 mutex_unlock(&fs_info->scrub_lock);
2848                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2849                 scrub_workers_put(fs_info);
2850                 return -EINPROGRESS;
2851         }
2852         sctx = scrub_setup_ctx(dev, is_dev_replace);
2853         if (IS_ERR(sctx)) {
2854                 mutex_unlock(&fs_info->scrub_lock);
2855                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2856                 scrub_workers_put(fs_info);
2857                 return PTR_ERR(sctx);
2858         }
2859         sctx->readonly = readonly;
2860         dev->scrub_device = sctx;
2861
2862         atomic_inc(&fs_info->scrubs_running);
2863         mutex_unlock(&fs_info->scrub_lock);
2864         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2865
2866         if (!is_dev_replace) {
2867                 down_read(&fs_info->scrub_super_lock);
2868                 ret = scrub_supers(sctx, dev);
2869                 up_read(&fs_info->scrub_super_lock);
2870         }
2871
2872         if (!ret)
2873                 ret = scrub_enumerate_chunks(sctx, dev, start, end,
2874                                              is_dev_replace);
2875
2876         wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
2877         atomic_dec(&fs_info->scrubs_running);
2878         wake_up(&fs_info->scrub_pause_wait);
2879
2880         wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0);
2881
2882         if (progress)
2883                 memcpy(progress, &sctx->stat, sizeof(*progress));
2884
2885         mutex_lock(&fs_info->scrub_lock);
2886         dev->scrub_device = NULL;
2887         mutex_unlock(&fs_info->scrub_lock);
2888
2889         scrub_free_ctx(sctx);
2890         scrub_workers_put(fs_info);
2891
2892         return ret;
2893 }
2894
2895 void btrfs_scrub_pause(struct btrfs_root *root)
2896 {
2897         struct btrfs_fs_info *fs_info = root->fs_info;
2898
2899         mutex_lock(&fs_info->scrub_lock);
2900         atomic_inc(&fs_info->scrub_pause_req);
2901         while (atomic_read(&fs_info->scrubs_paused) !=
2902                atomic_read(&fs_info->scrubs_running)) {
2903                 mutex_unlock(&fs_info->scrub_lock);
2904                 wait_event(fs_info->scrub_pause_wait,
2905                            atomic_read(&fs_info->scrubs_paused) ==
2906                            atomic_read(&fs_info->scrubs_running));
2907                 mutex_lock(&fs_info->scrub_lock);
2908         }
2909         mutex_unlock(&fs_info->scrub_lock);
2910 }
2911
2912 void btrfs_scrub_continue(struct btrfs_root *root)
2913 {
2914         struct btrfs_fs_info *fs_info = root->fs_info;
2915
2916         atomic_dec(&fs_info->scrub_pause_req);
2917         wake_up(&fs_info->scrub_pause_wait);
2918 }
2919
2920 void btrfs_scrub_pause_super(struct btrfs_root *root)
2921 {
2922         down_write(&root->fs_info->scrub_super_lock);
2923 }
2924
2925 void btrfs_scrub_continue_super(struct btrfs_root *root)
2926 {
2927         up_write(&root->fs_info->scrub_super_lock);
2928 }
2929
2930 int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
2931 {
2932         mutex_lock(&fs_info->scrub_lock);
2933         if (!atomic_read(&fs_info->scrubs_running)) {
2934                 mutex_unlock(&fs_info->scrub_lock);
2935                 return -ENOTCONN;
2936         }
2937
2938         atomic_inc(&fs_info->scrub_cancel_req);
2939         while (atomic_read(&fs_info->scrubs_running)) {
2940                 mutex_unlock(&fs_info->scrub_lock);
2941                 wait_event(fs_info->scrub_pause_wait,
2942                            atomic_read(&fs_info->scrubs_running) == 0);
2943                 mutex_lock(&fs_info->scrub_lock);
2944         }
2945         atomic_dec(&fs_info->scrub_cancel_req);
2946         mutex_unlock(&fs_info->scrub_lock);
2947
2948         return 0;
2949 }
2950
2951 int btrfs_scrub_cancel_dev(struct btrfs_fs_info *fs_info,
2952                            struct btrfs_device *dev)
2953 {
2954         struct scrub_ctx *sctx;
2955
2956         mutex_lock(&fs_info->scrub_lock);
2957         sctx = dev->scrub_device;
2958         if (!sctx) {
2959                 mutex_unlock(&fs_info->scrub_lock);
2960                 return -ENOTCONN;
2961         }
2962         atomic_inc(&sctx->cancel_req);
2963         while (dev->scrub_device) {
2964                 mutex_unlock(&fs_info->scrub_lock);
2965                 wait_event(fs_info->scrub_pause_wait,
2966                            dev->scrub_device == NULL);
2967                 mutex_lock(&fs_info->scrub_lock);
2968         }
2969         mutex_unlock(&fs_info->scrub_lock);
2970
2971         return 0;
2972 }
2973
2974 int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid)
2975 {
2976         struct btrfs_fs_info *fs_info = root->fs_info;
2977         struct btrfs_device *dev;
2978         int ret;
2979
2980         /*
2981          * we have to hold the device_list_mutex here so the device
2982          * does not go away in cancel_dev. FIXME: find a better solution
2983          */
2984         mutex_lock(&fs_info->fs_devices->device_list_mutex);
2985         dev = btrfs_find_device(fs_info, devid, NULL, NULL);
2986         if (!dev) {
2987                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2988                 return -ENODEV;
2989         }
2990         ret = btrfs_scrub_cancel_dev(fs_info, dev);
2991         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2992
2993         return ret;
2994 }
2995
2996 int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
2997                          struct btrfs_scrub_progress *progress)
2998 {
2999         struct btrfs_device *dev;
3000         struct scrub_ctx *sctx = NULL;
3001
3002         mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
3003         dev = btrfs_find_device(root->fs_info, devid, NULL, NULL);
3004         if (dev)
3005                 sctx = dev->scrub_device;
3006         if (sctx)
3007                 memcpy(progress, &sctx->stat, sizeof(*progress));
3008         mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
3009
3010         return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV;
3011 }
3012
3013 static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
3014                                u64 extent_logical, u64 extent_len,
3015                                u64 *extent_physical,
3016                                struct btrfs_device **extent_dev,
3017                                int *extent_mirror_num)
3018 {
3019         u64 mapped_length;
3020         struct btrfs_bio *bbio = NULL;
3021         int ret;
3022
3023         mapped_length = extent_len;
3024         ret = btrfs_map_block(fs_info, READ, extent_logical,
3025                               &mapped_length, &bbio, 0);
3026         if (ret || !bbio || mapped_length < extent_len ||
3027             !bbio->stripes[0].dev->bdev) {
3028                 kfree(bbio);
3029                 return;
3030         }
3031
3032         *extent_physical = bbio->stripes[0].physical;
3033         *extent_mirror_num = bbio->mirror_num;
3034         *extent_dev = bbio->stripes[0].dev;
3035         kfree(bbio);
3036 }
3037
3038 static int scrub_setup_wr_ctx(struct scrub_ctx *sctx,
3039                               struct scrub_wr_ctx *wr_ctx,
3040                               struct btrfs_fs_info *fs_info,
3041                               struct btrfs_device *dev,
3042                               int is_dev_replace)
3043 {
3044         WARN_ON(wr_ctx->wr_curr_bio != NULL);
3045
3046         mutex_init(&wr_ctx->wr_lock);
3047         wr_ctx->wr_curr_bio = NULL;
3048         if (!is_dev_replace)
3049                 return 0;
3050
3051         WARN_ON(!dev->bdev);
3052         wr_ctx->pages_per_wr_bio = min_t(int, SCRUB_PAGES_PER_WR_BIO,
3053                                          bio_get_nr_vecs(dev->bdev));
3054         wr_ctx->tgtdev = dev;
3055         atomic_set(&wr_ctx->flush_all_writes, 0);
3056         return 0;
3057 }
3058
3059 static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx)
3060 {
3061         mutex_lock(&wr_ctx->wr_lock);
3062         kfree(wr_ctx->wr_curr_bio);
3063         wr_ctx->wr_curr_bio = NULL;
3064         mutex_unlock(&wr_ctx->wr_lock);
3065 }
3066
3067 static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
3068                             int mirror_num, u64 physical_for_dev_replace)
3069 {
3070         struct scrub_copy_nocow_ctx *nocow_ctx;
3071         struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
3072
3073         nocow_ctx = kzalloc(sizeof(*nocow_ctx), GFP_NOFS);
3074         if (!nocow_ctx) {
3075                 spin_lock(&sctx->stat_lock);
3076                 sctx->stat.malloc_errors++;
3077                 spin_unlock(&sctx->stat_lock);
3078                 return -ENOMEM;
3079         }
3080
3081         scrub_pending_trans_workers_inc(sctx);
3082
3083         nocow_ctx->sctx = sctx;
3084         nocow_ctx->logical = logical;
3085         nocow_ctx->len = len;
3086         nocow_ctx->mirror_num = mirror_num;
3087         nocow_ctx->physical_for_dev_replace = physical_for_dev_replace;
3088         nocow_ctx->work.func = copy_nocow_pages_worker;
3089         btrfs_queue_worker(&fs_info->scrub_nocow_workers,
3090                            &nocow_ctx->work);
3091
3092         return 0;
3093 }
3094
3095 static void copy_nocow_pages_worker(struct btrfs_work *work)
3096 {
3097         struct scrub_copy_nocow_ctx *nocow_ctx =
3098                 container_of(work, struct scrub_copy_nocow_ctx, work);
3099         struct scrub_ctx *sctx = nocow_ctx->sctx;
3100         u64 logical = nocow_ctx->logical;
3101         u64 len = nocow_ctx->len;
3102         int mirror_num = nocow_ctx->mirror_num;
3103         u64 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
3104         int ret;
3105         struct btrfs_trans_handle *trans = NULL;
3106         struct btrfs_fs_info *fs_info;
3107         struct btrfs_path *path;
3108         struct btrfs_root *root;
3109         int not_written = 0;
3110
3111         fs_info = sctx->dev_root->fs_info;
3112         root = fs_info->extent_root;
3113
3114         path = btrfs_alloc_path();
3115         if (!path) {
3116                 spin_lock(&sctx->stat_lock);
3117                 sctx->stat.malloc_errors++;
3118                 spin_unlock(&sctx->stat_lock);
3119                 not_written = 1;
3120                 goto out;
3121         }
3122
3123         trans = btrfs_join_transaction(root);
3124         if (IS_ERR(trans)) {
3125                 not_written = 1;
3126                 goto out;
3127         }
3128
3129         ret = iterate_inodes_from_logical(logical, fs_info, path,
3130                                           copy_nocow_pages_for_inode,
3131                                           nocow_ctx);
3132         if (ret != 0 && ret != -ENOENT) {
3133                 pr_warn("iterate_inodes_from_logical() failed: log %llu, phys %llu, len %llu, mir %llu, ret %d\n",
3134                         (unsigned long long)logical,
3135                         (unsigned long long)physical_for_dev_replace,
3136                         (unsigned long long)len,
3137                         (unsigned long long)mirror_num, ret);
3138                 not_written = 1;
3139                 goto out;
3140         }
3141
3142 out:
3143         if (trans && !IS_ERR(trans))
3144                 btrfs_end_transaction(trans, root);
3145         if (not_written)
3146                 btrfs_dev_replace_stats_inc(&fs_info->dev_replace.
3147                                             num_uncorrectable_read_errors);
3148
3149         btrfs_free_path(path);
3150         kfree(nocow_ctx);
3151
3152         scrub_pending_trans_workers_dec(sctx);
3153 }
3154
3155 static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx)
3156 {
3157         unsigned long index;
3158         struct scrub_copy_nocow_ctx *nocow_ctx = ctx;
3159         int ret = 0;
3160         struct btrfs_key key;
3161         struct inode *inode = NULL;
3162         struct btrfs_root *local_root;
3163         u64 physical_for_dev_replace;
3164         u64 len;
3165         struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info;
3166
3167         key.objectid = root;
3168         key.type = BTRFS_ROOT_ITEM_KEY;
3169         key.offset = (u64)-1;
3170         local_root = btrfs_read_fs_root_no_name(fs_info, &key);
3171         if (IS_ERR(local_root))
3172                 return PTR_ERR(local_root);
3173
3174         key.type = BTRFS_INODE_ITEM_KEY;
3175         key.objectid = inum;
3176         key.offset = 0;
3177         inode = btrfs_iget(fs_info->sb, &key, local_root, NULL);
3178         if (IS_ERR(inode))
3179                 return PTR_ERR(inode);
3180
3181         physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
3182         len = nocow_ctx->len;
3183         while (len >= PAGE_CACHE_SIZE) {
3184                 struct page *page = NULL;
3185                 int ret_sub;
3186
3187                 index = offset >> PAGE_CACHE_SHIFT;
3188
3189                 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
3190                 if (!page) {
3191                         pr_err("find_or_create_page() failed\n");
3192                         ret = -ENOMEM;
3193                         goto next_page;
3194                 }
3195
3196                 if (PageUptodate(page)) {
3197                         if (PageDirty(page))
3198                                 goto next_page;
3199                 } else {
3200                         ClearPageError(page);
3201                         ret_sub = extent_read_full_page(&BTRFS_I(inode)->
3202                                                          io_tree,
3203                                                         page, btrfs_get_extent,
3204                                                         nocow_ctx->mirror_num);
3205                         if (ret_sub) {
3206                                 ret = ret_sub;
3207                                 goto next_page;
3208                         }