2 * raid5.c : Multiple Devices driver for Linux
3 * Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
4 * Copyright (C) 1999, 2000 Ingo Molnar
5 * Copyright (C) 2002, 2003 H. Peter Anvin
7 * RAID-4/5/6 management functions.
8 * Thanks to Penguin Computing for making the RAID-6 development possible
9 * by donating a test server!
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2, or (at your option)
16 * You should have received a copy of the GNU General Public License
17 * (for example /usr/src/linux/COPYING); if not, write to the Free
18 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24 * The sequencing for updating the bitmap reliably is a little
25 * subtle (and I got it wrong the first time) so it deserves some
28 * We group bitmap updates into batches. Each batch has a number.
29 * We may write out several batches at once, but that isn't very important.
30 * conf->seq_write is the number of the last batch successfully written.
31 * conf->seq_flush is the number of the last batch that was closed to
33 * When we discover that we will need to write to any block in a stripe
34 * (in add_stripe_bio) we update the in-memory bitmap and record in sh->bm_seq
35 * the number of the batch it will be in. This is seq_flush+1.
36 * When we are ready to do a write, if that batch hasn't been written yet,
37 * we plug the array and queue the stripe for later.
38 * When an unplug happens, we increment conf->seq_flush, thus closing the current batch.
40 * When we notice that conf->seq_flush > conf->seq_write, we write out all pending updates
41 * to the bitmap, and advance conf->seq_write to where conf->seq_flush was.
42 * This may occasionally write a bit out twice, but is sure never to
46 #include <linux/blkdev.h>
47 #include <linux/kthread.h>
48 #include <linux/raid/pq.h>
49 #include <linux/async_tx.h>
50 #include <linux/async.h>
51 #include <linux/seq_file.h>
52 #include <linux/cpu.h>
53 #include <linux/slab.h>
54 #include <linux/ratelimit.h>
/*
 * Stripe cache geometry and hashing.
 *
 * A stripe unit is one page (STRIPE_SIZE); STRIPE_SECTORS and STRIPE_SHIFT
 * express that size in 512-byte sectors (hence the >>9 and the -9).
 * NR_HASH is the number of hlist_head entries that fit in one page and
 * HASH_MASK is NR_HASH - 1, which only works as a mask when NR_HASH is a
 * power of two (assumed - TODO confirm for every configuration).
 * stripe_hash() selects the bucket for a sector by dropping the low
 * STRIPE_SHIFT bits and masking with HASH_MASK.
 * IO_THRESHOLD is compared against conf->preread_active_stripes when a
 * stripe is released.
 */
64 #define NR_STRIPES 256
65 #define STRIPE_SIZE PAGE_SIZE
66 #define STRIPE_SHIFT (PAGE_SHIFT - 9)
67 #define STRIPE_SECTORS (STRIPE_SIZE>>9)
68 #define IO_THRESHOLD 1
69 #define BYPASS_THRESHOLD 1
70 #define NR_HASH (PAGE_SIZE / sizeof(struct hlist_head))
71 #define HASH_MASK (NR_HASH - 1)
73 #define stripe_hash(conf, sect) (&((conf)->stripe_hashtbl[((sect) >> STRIPE_SHIFT) & HASH_MASK]))
75 /* bio's attached to a stripe+device for I/O are linked together in bi_sector
76 * order without overlap. There may be several bio's per stripe+device, and
77 * a bio could span several devices.
78 * When walking this list for a particular stripe+device, we must never proceed
79 * beyond a bio that extends past this device, as the next bio might no longer
81 * This macro is used to determine the 'next' bio in the list, given the sector
82 * of the current stripe+device
/*
 * r5_next_bio - next bio on a stripe+device list, or NULL to stop walking.
 * @bio:  current bio in the list
 * @sect: first sector of the current stripe+device
 *
 * Yields (bio)->bi_next only when @bio ends strictly before
 * @sect + STRIPE_SECTORS, otherwise NULL. bi_size is converted from bytes
 * to 512-byte sectors by the >>9.
 *
 * Both arguments are parenthesized so that an expression passed as @sect
 * keeps its own precedence against the surrounding < and +. @bio is still
 * evaluated more than once, so do not pass an expression with side effects.
 */
#define r5_next_bio(bio, sect) ( ( (bio)->bi_sector + ((bio)->bi_size>>9) < (sect) + STRIPE_SECTORS) ? (bio)->bi_next : NULL)
86 * The following can be used to debug the driver
/*
 * Debug switch. When RAID5_PARANOIA is non-zero and CONFIG_SMP is defined,
 * CHECK_DEVLOCK() asserts that conf->device_lock is held; the macro relies
 * on a variable named conf being in scope at the call site.
 * The second definition expands to nothing.
 * NOTE(review): the #else / #endif lines that separate the two definitions
 * are not visible in this listing.
 */
88 #define RAID5_PARANOIA 1
89 #if RAID5_PARANOIA && defined(CONFIG_SMP)
90 # define CHECK_DEVLOCK() assert_spin_locked(&conf->device_lock)
92 # define CHECK_DEVLOCK()
101 * We maintain a biased count of active stripes in the bottom 16 bits of
102 * bi_phys_segments, and a count of processed stripes in the upper 16 bits
/*
 * Return the low 16 bits of bio->bi_phys_segments, which this driver uses
 * as the biased count of active stripes for the bio.
 */
104 static inline int raid5_bi_phys_segments(struct bio *bio)
106 return bio->bi_phys_segments & 0xffff;
/*
 * Return bits 16..31 of bio->bi_phys_segments, which this driver uses as
 * the count of processed stripes for the bio.
 */
109 static inline int raid5_bi_hw_segments(struct bio *bio)
111 return (bio->bi_phys_segments >> 16) & 0xffff;
/*
 * Decrement bio->bi_phys_segments as a whole word (so the low-half count
 * drops by one) and return the resulting low 16 bits. Callers in this file
 * treat a zero result as "no stripe references this bio any more".
 */
114 static inline int raid5_dec_bi_phys_segments(struct bio *bio)
116 --bio->bi_phys_segments;
117 return raid5_bi_phys_segments(bio);
/*
 * Read the upper-half count into val and write it back combined with the
 * unchanged low half.
 * NOTE(review): the listing skips lines between the read and the
 * write-back and after it; presumably val is decremented in between and
 * returned at the end - confirm against the complete source.
 */
120 static inline int raid5_dec_bi_hw_segments(struct bio *bio)
122 unsigned short val = raid5_bi_hw_segments(bio);
125 bio->bi_phys_segments = (val << 16) | raid5_bi_phys_segments(bio);
/*
 * Replace the upper 16 bits of bio->bi_phys_segments with cnt while keeping
 * the current low 16 bits. cnt is shifted but not masked; callers are
 * assumed to pass a value that fits in 16 bits - TODO confirm.
 */
129 static inline void raid5_set_bi_hw_segments(struct bio *bio, unsigned int cnt)
131 bio->bi_phys_segments = raid5_bi_phys_segments(bio) | (cnt << 16);
134 /* Find first data disk in a raid6 stripe */
/*
 * Index of the device at which walking of a RAID-6 stripe starts.
 * Per the inline comments, a DDF layout starts at the first device while
 * the md layout starts just after the Q block; the visible test checks
 * whether Q is the last device (qd_idx == disks - 1).
 * NOTE(review): the ddf_layout test and the return statements for the
 * first two cases are not visible in this listing.
 */
135 static inline int raid6_d0(struct stripe_head *sh)
138 /* ddf always start from first device */
140 /* md starts just after Q block */
141 if (sh->qd_idx == sh->disks - 1)
144 return sh->qd_idx + 1;
/*
 * Return disk when it is below raid_disks, otherwise wrap to 0.
 * NOTE(review): the listing skips a line before the return; given the
 * function name, disk is presumably advanced first - confirm.
 */
146 static inline int raid6_next_disk(int disk, int raid_disks)
149 return (disk < raid_disks) ? disk : 0;
152 /* When walking through the disks in a raid6, starting at raid6_d0,
153 * we need to map each disk to a 'slot', where the data disks are slot
154 * 0 .. raid_disks-3, the parity disk is raid_disks-2 and the Q disk
155 * is raid_disks-1. This helper does that mapping.
/*
 * Map device index idx of stripe sh to its syndrome slot.
 * The P device maps to slot syndrome_disks and the Q device to slot
 * syndrome_disks + 1.
 * NOTE(review): the handling of data devices, including how *count is
 * used, is in lines not visible in this listing.
 */
157 static int raid6_idx_to_slot(int idx, struct stripe_head *sh,
158 int *count, int syndrome_disks)
164 if (idx == sh->pd_idx)
165 return syndrome_disks;
166 if (idx == sh->qd_idx)
167 return syndrome_disks + 1;
/*
 * Walk the singly linked list of bios headed by return_bi, advancing
 * through bi_next.
 * NOTE(review): what is done with each bio is in lines not visible here;
 * presumably each one is unlinked and completed - confirm.
 */
173 static void return_io(struct bio *return_bi)
175 struct bio *bi = return_bi;
178 return_bi = bi->bi_next;
/* Forward declaration; the definition is outside this part of the file. */
186 static void print_raid5_conf (raid5_conf_t *conf);
/*
 * Non-zero when the stripe has an operation in flight: a non-zero
 * check_state or reconstruct_state, or STRIPE_BIOFILL_RUN or
 * STRIPE_COMPUTE_RUN set in sh->state.
 */
188 static int stripe_operations_active(struct stripe_head *sh)
190 return sh->check_state || sh->reconstruct_state ||
191 test_bit(STRIPE_BIOFILL_RUN, &sh->state) ||
192 test_bit(STRIPE_COMPUTE_RUN, &sh->state);
/*
 * Drop one reference on sh and, when the count reaches zero, queue the
 * stripe on the list that matches its state. release_stripe() calls this
 * with conf->device_lock held.
 *
 * With STRIPE_HANDLE set the stripe goes to delayed_list (STRIPE_DELAYED),
 * to bitmap_list (STRIPE_BIT_DELAY and sh->bm_seq still ahead of
 * conf->seq_write), or otherwise to handle_list after STRIPE_BIT_DELAY is
 * cleared; the md thread is then woken.
 * Without STRIPE_HANDLE no operation may be active. Preread accounting is
 * dropped (waking the md thread when preread_active_stripes falls below
 * IO_THRESHOLD), active_stripes is decremented and, unless the stripe is
 * STRIPE_EXPANDING, it is put on inactive_list, waiters on wait_for_stripe
 * are woken, and the md thread is woken if retry_read_aligned is set.
 *
 * NOTE(review): the else keywords and closing braces are not visible in
 * this listing, so the branch structure above is inferred from statement
 * order.
 */
195 static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
197 if (atomic_dec_and_test(&sh->count)) {
198 BUG_ON(!list_empty(&sh->lru));
199 BUG_ON(atomic_read(&conf->active_stripes)==0);
200 if (test_bit(STRIPE_HANDLE, &sh->state)) {
201 if (test_bit(STRIPE_DELAYED, &sh->state))
202 list_add_tail(&sh->lru, &conf->delayed_list);
203 else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
204 sh->bm_seq - conf->seq_write > 0)
205 list_add_tail(&sh->lru, &conf->bitmap_list);
207 clear_bit(STRIPE_BIT_DELAY, &sh->state);
208 list_add_tail(&sh->lru, &conf->handle_list);
210 md_wakeup_thread(conf->mddev->thread);
212 BUG_ON(stripe_operations_active(sh));
213 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
214 atomic_dec(&conf->preread_active_stripes);
215 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
216 md_wakeup_thread(conf->mddev->thread);
218 atomic_dec(&conf->active_stripes);
219 if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
220 list_add_tail(&sh->lru, &conf->inactive_list);
221 wake_up(&conf->wait_for_stripe);
222 if (conf->retry_read_aligned)
223 md_wakeup_thread(conf->mddev->thread);
/*
 * Locked wrapper around __release_stripe(): takes conf->device_lock with
 * the interrupt state saved, drops the reference, then restores.
 */
229 static void release_stripe(struct stripe_head *sh)
231 raid5_conf_t *conf = sh->raid_conf;
234 spin_lock_irqsave(&conf->device_lock, flags);
235 __release_stripe(conf, sh);
236 spin_unlock_irqrestore(&conf->device_lock, flags);
/*
 * Log the stripe sector at debug level and unlink sh from its hash chain.
 * hlist_del_init() is used, so the node is left reinitialised.
 */
239 static inline void remove_hash(struct stripe_head *sh)
241 pr_debug("remove_hash(), stripe %llu\n",
242 (unsigned long long)sh->sector);
244 hlist_del_init(&sh->hash);
/*
 * Log the stripe sector at debug level and add sh at the head of the hash
 * bucket that stripe_hash() selects for sh->sector.
 */
247 static inline void insert_hash(raid5_conf_t *conf, struct stripe_head *sh)
249 struct hlist_head *hp = stripe_hash(conf, sh->sector);
251 pr_debug("insert_hash(), stripe %llu\n",
252 (unsigned long long)sh->sector);
255 hlist_add_head(&sh->hash, hp);
259 /* find an idle stripe, make sure it is unhashed, and return it. */
/*
 * Take the first stripe from conf->inactive_list, if there is one, unlink
 * it from that list and count it in conf->active_stripes.
 * sh starts out NULL, which is presumably what is returned when the list
 * is empty.
 * NOTE(review): the empty-list branch target, the unhashing step and the
 * return statement are not visible in this listing.
 */
260 static struct stripe_head *get_free_stripe(raid5_conf_t *conf)
262 struct stripe_head *sh = NULL;
263 struct list_head *first;
266 if (list_empty(&conf->inactive_list))
268 first = conf->inactive_list.next;
269 sh = list_entry(first, struct stripe_head, lru);
270 list_del_init(first);
272 atomic_inc(&conf->active_stripes);
/*
 * Walk all pool_size device slots of the stripe and reset each
 * sh->dev[i].page to NULL.
 * NOTE(review): the lines between the loop header and the reset are not
 * visible; presumably the page is released there - confirm.
 */
277 static void shrink_buffers(struct stripe_head *sh)
281 int num = sh->raid_conf->pool_size;
283 for (i = 0; i < num ; i++) {
287 sh->dev[i].page = NULL;
/*
 * Allocate one page (GFP_KERNEL) for each of the pool_size device slots
 * and store it in sh->dev[i].page.
 * grow_one_stripe() treats a non-zero return as failure.
 * NOTE(review): the body of the allocation-failure branch and the return
 * statements are not visible in this listing.
 */
292 static int grow_buffers(struct stripe_head *sh)
295 int num = sh->raid_conf->pool_size;
297 for (i = 0; i < num; i++) {
300 if (!(page = alloc_page(GFP_KERNEL))) {
303 sh->dev[i].page = page;
/*
 * Forward declarations for helpers called by init_stripe(); their
 * definitions are outside this part of the file.
 */
308 static void raid5_build_block(struct stripe_head *sh, int i, int previous);
309 static void stripe_set_idx(sector_t stripe, raid5_conf_t *conf, int previous,
310 struct stripe_head *sh);
/*
 * Prepare an idle stripe_head for use at the given sector.
 * @previous selects the pre-reshape geometry: generation is
 * conf->generation - previous and the disk count comes from
 * previous_raid_disks instead of raid_disks.
 *
 * The stripe must have no references, must not be flagged STRIPE_HANDLE
 * and must have no operation in flight (all enforced with BUG_ON).
 * After stripe_set_idx() every device slot is checked for leftover
 * toread/read/towrite/written bios or R5_LOCKED, which is reported with
 * printk, and is then rebuilt with raid5_build_block(). Finally the stripe
 * is inserted into the hash table.
 * NOTE(review): several lines are not visible in this listing, including
 * any assignment of sh->sector.
 */
312 static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
314 raid5_conf_t *conf = sh->raid_conf;
317 BUG_ON(atomic_read(&sh->count) != 0);
318 BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
319 BUG_ON(stripe_operations_active(sh));
322 pr_debug("init_stripe called, stripe %llu\n",
323 (unsigned long long)sh->sector);
327 sh->generation = conf->generation - previous;
328 sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks;
330 stripe_set_idx(sector, conf, previous, sh);
334 for (i = sh->disks; i--; ) {
335 struct r5dev *dev = &sh->dev[i];
337 if (dev->toread || dev->read || dev->towrite || dev->written ||
338 test_bit(R5_LOCKED, &dev->flags)) {
339 printk(KERN_ERR "sector=%llx i=%d %p %p %p %p %d\n",
340 (unsigned long long)sh->sector, i, dev->toread,
341 dev->read, dev->towrite, dev->written,
342 test_bit(R5_LOCKED, &dev->flags));
346 raid5_build_block(sh, i, previous);
348 insert_hash(conf, sh);
/*
 * Search the hash bucket for sector for a stripe whose sector and
 * generation both match.
 * NOTE(review): the rest of the parameter list (the generation argument)
 * and the return statements are not visible in this listing; presumably
 * the match is returned and NULL is returned on a miss - confirm.
 */
351 static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector,
354 struct stripe_head *sh;
355 struct hlist_node *hn;
358 pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector);
359 hlist_for_each_entry(sh, hn, stripe_hash(conf, sector), hash)
360 if (sh->sector == sector && sh->generation == generation)
362 pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector);
367 * Need to check if array has failed when deciding whether to:
369 * - remove non-faulty devices
372 * This determination is simple when no reshape is happening.
373 * However if there is a reshape, we need to carefully check
374 * both the before and after sections.
375 * This is because some failed devices may only affect one
376 * of the two sections, and some non-in_sync devices may
377 * be insync in the section most affected by failed devices.
/*
 * Decide whether the array has more failed devices than it can tolerate.
 * With no reshape in progress (reshape_position == MaxSector) the answer
 * is simply mddev->degraded > conf->max_degraded.
 * During a reshape the devices are scanned twice, first over
 * previous_raid_disks and then over raid_disks, and a degraded count is
 * compared with max_degraded after each scan. A device that is neither
 * faulty nor in-sync is judged by the direction of the reshape, as the
 * inline comments explain.
 * NOTE(review): the statements that update the degraded counter and the
 * return statements of the reshape path are not visible in this listing.
 */
379 static int has_failed(raid5_conf_t *conf)
383 if (conf->mddev->reshape_position == MaxSector)
384 return conf->mddev->degraded > conf->max_degraded;
388 for (i = 0; i < conf->previous_raid_disks; i++) {
389 mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev);
390 if (!rdev || test_bit(Faulty, &rdev->flags))
392 else if (test_bit(In_sync, &rdev->flags))
395 /* not in-sync or faulty.
396 * If the reshape increases the number of devices,
397 * this is being recovered by the reshape, so
398 * this 'previous' section is not in_sync.
399 * If the number of devices is being reduced however,
400 * the device can only be part of the array if
401 * we are reverting a reshape, so this section will
404 if (conf->raid_disks >= conf->previous_raid_disks)
408 if (degraded > conf->max_degraded)
412 for (i = 0; i < conf->raid_disks; i++) {
413 mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev);
414 if (!rdev || test_bit(Faulty, &rdev->flags))
416 else if (test_bit(In_sync, &rdev->flags))
419 /* not in-sync or faulty.
420 * If reshape increases the number of devices, this
421 * section has already been recovered, else it
422 * almost certainly hasn't.
424 if (conf->raid_disks <= conf->previous_raid_disks)
428 if (degraded > conf->max_degraded)
/*
 * Find or set up the stripe_head for sector and take a reference on it.
 * The work is done under conf->device_lock and repeats until a stripe has
 * been obtained.
 *
 * Each pass waits for conf->quiesce to clear (skipped when noquiesce is
 * set) and looks the stripe up by sector and generation, where previous
 * selects the pre-reshape generation. On a miss a free stripe is taken
 * unless inactive_blocked is set. If none is available the noblock case
 * is tested; otherwise inactive_blocked is set and the caller sleeps until
 * inactive_list is non-empty and either active_stripes has dropped below
 * 3/4 of max_nr_stripes or inactive_blocked was cleared. A freshly
 * obtained stripe is initialised with init_stripe(). For a stripe found in
 * the hash, the lru and active_stripes bookkeeping is adjusted.
 *
 * NOTE(review): the else branches, the noblock exit and the return
 * statement are not visible in this listing; the flow above is inferred
 * from statement order.
 */
433 static struct stripe_head *
434 get_active_stripe(raid5_conf_t *conf, sector_t sector,
435 int previous, int noblock, int noquiesce)
437 struct stripe_head *sh;
439 pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector);
441 spin_lock_irq(&conf->device_lock);
444 wait_event_lock_irq(conf->wait_for_stripe,
445 conf->quiesce == 0 || noquiesce,
446 conf->device_lock, /* nothing */);
447 sh = __find_stripe(conf, sector, conf->generation - previous);
449 if (!conf->inactive_blocked)
450 sh = get_free_stripe(conf);
451 if (noblock && sh == NULL)
454 conf->inactive_blocked = 1;
455 wait_event_lock_irq(conf->wait_for_stripe,
456 !list_empty(&conf->inactive_list) &&
457 (atomic_read(&conf->active_stripes)
458 < (conf->max_nr_stripes *3/4)
459 || !conf->inactive_blocked),
462 conf->inactive_blocked = 0;
464 init_stripe(sh, sector, previous);
466 if (atomic_read(&sh->count)) {
467 BUG_ON(!list_empty(&sh->lru)
468 && !test_bit(STRIPE_EXPANDING, &sh->state));
470 if (!test_bit(STRIPE_HANDLE, &sh->state))
471 atomic_inc(&conf->active_stripes);
472 if (list_empty(&sh->lru) &&
473 !test_bit(STRIPE_EXPANDING, &sh->state))
475 list_del_init(&sh->lru);
478 } while (sh == NULL);
481 atomic_inc(&sh->count);
483 spin_unlock_irq(&conf->device_lock);
/*
 * Forward declarations of the bio completion handlers that ops_run_io()
 * installs in bi_end_io. The return-type lines are not visible in this
 * listing.
 */
488 raid5_end_read_request(struct bio *bi, int error);
490 raid5_end_write_request(struct bio *bi, int error);
/*
 * Issue the reads and writes that were flagged on each device of a stripe.
 *
 * Devices are visited from the highest index down. R5_Wantwrite or
 * R5_Wantread is consumed to pick the direction (R5_WantFUA is consumed
 * together with a write), the embedded bio sh->dev[i].req gets the
 * matching completion handler, and the rdev is looked up with
 * rcu_dereference(); a Faulty rdev is tested for, and a reference is
 * taken on the rdev via nr_pending.
 * For writes to an rdev with WriteErrorSeen, is_badblock() is consulted
 * for the stripe range. The loop either waits for the rdev to unblock
 * (after setting BlockedBadBlocks and possibly calling
 * md_check_recovery()) or drops the rdev reference to skip the write when
 * the bad block is acknowledged.
 * With a usable rdev the bio is set up for one STRIPE_SIZE segment at
 * sh->sector + rdev->data_offset, a stripe reference is taken and
 * generic_make_request() submits it; sync/expand I/O is accounted with
 * md_sync_acct(). Otherwise R5_LOCKED is cleared and STRIPE_HANDLE is set
 * so the stripe is looked at again.
 *
 * NOTE(review): many short lines (rw assignments, else branches, loop
 * exits, RCU bracketing) are not visible in this listing; the flow above
 * is inferred from the statements that remain.
 */
492 static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
494 raid5_conf_t *conf = sh->raid_conf;
495 int i, disks = sh->disks;
499 for (i = disks; i--; ) {
503 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
504 if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags))
508 } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
513 bi = &sh->dev[i].req;
517 bi->bi_end_io = raid5_end_write_request;
519 bi->bi_end_io = raid5_end_read_request;
522 rdev = rcu_dereference(conf->disks[i].rdev);
523 if (rdev && test_bit(Faulty, &rdev->flags))
526 atomic_inc(&rdev->nr_pending);
529 /* We have already checked bad blocks for reads. Now
530 * need to check for writes.
532 while ((rw & WRITE) && rdev &&
533 test_bit(WriteErrorSeen, &rdev->flags)) {
536 int bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS,
537 &first_bad, &bad_sectors);
542 set_bit(BlockedBadBlocks, &rdev->flags);
543 if (!conf->mddev->external &&
544 conf->mddev->flags) {
545 /* It is very unlikely, but we might
546 * still need to write out the
547 * bad block log - better give it
549 md_check_recovery(conf->mddev);
551 md_wait_for_blocked_rdev(rdev, conf->mddev);
553 /* Acknowledged bad block - skip the write */
554 rdev_dec_pending(rdev, conf->mddev);
560 if (s->syncing || s->expanding || s->expanded)
561 md_sync_acct(rdev->bdev, STRIPE_SECTORS);
563 set_bit(STRIPE_IO_STARTED, &sh->state);
565 bi->bi_bdev = rdev->bdev;
566 pr_debug("%s: for %llu schedule op %ld on disc %d\n",
567 __func__, (unsigned long long)sh->sector,
569 atomic_inc(&sh->count);
570 bi->bi_sector = sh->sector + rdev->data_offset;
571 bi->bi_flags = 1 << BIO_UPTODATE;
575 bi->bi_io_vec = &sh->dev[i].vec;
576 bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
577 bi->bi_io_vec[0].bv_offset = 0;
578 bi->bi_size = STRIPE_SIZE;
580 generic_make_request(bi);
583 set_bit(STRIPE_DEGRADED, &sh->state);
584 pr_debug("skip op %ld on disc %d for sector %llu\n",
585 bi->bi_rw, i, (unsigned long long)sh->sector);
586 clear_bit(R5_LOCKED, &sh->dev[i].flags);
587 set_bit(STRIPE_HANDLE, &sh->state);
/*
 * Copy data between a bio and a stripe cache page with async_memcpy().
 * @frombio: direction selector; callers in this file pass 1 when draining
 *           write bios into the cache and 0 when filling read bios
 * @bio:     bio whose segments are walked
 * @page:    stripe cache page
 * @sector:  first sector covered by @page
 * @tx:      descriptor the first copy depends on
 *
 * page_offset is the byte offset inside @page at which the bio data
 * begins; it is negative when the bio starts before @sector, in which
 * case the leading part of the segment is skipped via b_offset. Each copy
 * is clipped so that it does not run past STRIPE_SIZE, and is chained to
 * the previous one through submit.depend_tx. Walking stops once a segment
 * had to be clipped (end of page).
 *
 * NOTE(review): the condition choosing between the two async_memcpy calls,
 * the length adjustments and the return statement are not visible in this
 * listing; callers assign the result to their tx variable.
 */
592 static struct dma_async_tx_descriptor *
593 async_copy_data(int frombio, struct bio *bio, struct page *page,
594 sector_t sector, struct dma_async_tx_descriptor *tx)
597 struct page *bio_page;
600 struct async_submit_ctl submit;
601 enum async_tx_flags flags = 0;
603 if (bio->bi_sector >= sector)
604 page_offset = (signed)(bio->bi_sector - sector) * 512;
606 page_offset = (signed)(sector - bio->bi_sector) * -512;
609 flags |= ASYNC_TX_FENCE;
610 init_async_submit(&submit, flags, tx, NULL, NULL, NULL);
612 bio_for_each_segment(bvl, bio, i) {
613 int len = bvl->bv_len;
617 if (page_offset < 0) {
618 b_offset = -page_offset;
619 page_offset += b_offset;
623 if (len > 0 && page_offset + len > STRIPE_SIZE)
624 clen = STRIPE_SIZE - page_offset;
629 b_offset += bvl->bv_offset;
630 bio_page = bvl->bv_page;
632 tx = async_memcpy(page, bio_page, page_offset,
633 b_offset, clen, &submit);
635 tx = async_memcpy(bio_page, page, b_offset,
636 page_offset, clen, &submit);
638 /* chain the operations */
639 submit.depend_tx = tx;
641 if (clen < len) /* hit end of page */
/*
 * Completion callback for a biofill operation (installed by
 * ops_run_biofill()). stripe_head_ref is the stripe_head.
 *
 * Under conf->device_lock, every device whose R5_Wantfill flag was set
 * (the flag is cleared here) has its bios walked while they start inside
 * the device range. One bi_phys_segments reference is dropped per bio,
 * and a bio whose count reaches zero is pushed onto the local return_bi
 * list. Afterwards STRIPE_BIOFILL_RUN is cleared, the collected bios are
 * handed to return_io() and STRIPE_HANDLE is set.
 *
 * NOTE(review): the lines that pick up and reset the per-device read list
 * and the final stripe release are not visible in this listing.
 */
649 static void ops_complete_biofill(void *stripe_head_ref)
651 struct stripe_head *sh = stripe_head_ref;
652 struct bio *return_bi = NULL;
653 raid5_conf_t *conf = sh->raid_conf;
656 pr_debug("%s: stripe %llu\n", __func__,
657 (unsigned long long)sh->sector);
659 /* clear completed biofills */
660 spin_lock_irq(&conf->device_lock);
661 for (i = sh->disks; i--; ) {
662 struct r5dev *dev = &sh->dev[i];
664 /* acknowledge completion of a biofill operation */
665 /* and check if we need to reply to a read request,
666 * new R5_Wantfill requests are held off until
667 * !STRIPE_BIOFILL_RUN
669 if (test_and_clear_bit(R5_Wantfill, &dev->flags)) {
670 struct bio *rbi, *rbi2;
675 while (rbi && rbi->bi_sector <
676 dev->sector + STRIPE_SECTORS) {
677 rbi2 = r5_next_bio(rbi, dev->sector);
678 if (!raid5_dec_bi_phys_segments(rbi)) {
679 rbi->bi_next = return_bi;
686 spin_unlock_irq(&conf->device_lock);
687 clear_bit(STRIPE_BIOFILL_RUN, &sh->state);
689 return_io(return_bi);
691 set_bit(STRIPE_HANDLE, &sh->state);
/*
 * Satisfy pending reads from the stripe cache.
 * For every device flagged R5_Wantfill, the toread list is taken over as
 * dev->read under conf->device_lock and each bio that starts inside the
 * device range is filled from dev->page with async_copy_data(0, ...).
 * A stripe reference is then taken and ops_complete_biofill() is scheduled
 * to run after the last copy via async_trigger_callback().
 * NOTE(review): the line between the list hand-over and the unlock is not
 * visible in this listing.
 */
695 static void ops_run_biofill(struct stripe_head *sh)
697 struct dma_async_tx_descriptor *tx = NULL;
698 raid5_conf_t *conf = sh->raid_conf;
699 struct async_submit_ctl submit;
702 pr_debug("%s: stripe %llu\n", __func__,
703 (unsigned long long)sh->sector);
705 for (i = sh->disks; i--; ) {
706 struct r5dev *dev = &sh->dev[i];
707 if (test_bit(R5_Wantfill, &dev->flags)) {
709 spin_lock_irq(&conf->device_lock);
710 dev->read = rbi = dev->toread;
712 spin_unlock_irq(&conf->device_lock);
713 while (rbi && rbi->bi_sector <
714 dev->sector + STRIPE_SECTORS) {
715 tx = async_copy_data(0, rbi, dev->page,
717 rbi = r5_next_bio(rbi, dev->sector);
722 atomic_inc(&sh->count);
723 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_biofill, sh, NULL);
724 async_trigger_callback(&submit);
/*
 * Mark the computed block sh->dev[target] as R5_UPTODATE and clear its
 * R5_Wantcompute flag, which must have been set (BUG_ON otherwise).
 * NOTE(review): targets can be negative elsewhere in this file (see the
 * target < 0 tests in ops_run_compute6_1); a guard for that case is
 * presumably in the lines not visible here - confirm.
 */
727 static void mark_target_uptodate(struct stripe_head *sh, int target)
734 tgt = &sh->dev[target];
735 set_bit(R5_UPTODATE, &tgt->flags);
736 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
737 clear_bit(R5_Wantcompute, &tgt->flags);
/*
 * Completion callback for the compute operations. stripe_head_ref is the
 * stripe_head. Marks both recorded targets up to date, clears
 * STRIPE_COMPUTE_RUN, advances check_state from compute_run to
 * compute_result when a check was in progress, and sets STRIPE_HANDLE.
 * NOTE(review): the final stripe release is not visible in this listing.
 */
740 static void ops_complete_compute(void *stripe_head_ref)
742 struct stripe_head *sh = stripe_head_ref;
744 pr_debug("%s: stripe %llu\n", __func__,
745 (unsigned long long)sh->sector);
747 /* mark the computed target(s) as uptodate */
748 mark_target_uptodate(sh, sh->ops.target);
749 mark_target_uptodate(sh, sh->ops.target2);
751 clear_bit(STRIPE_COMPUTE_RUN, &sh->state);
752 if (sh->check_state == check_state_compute_run)
753 sh->check_state = check_state_compute_result;
754 set_bit(STRIPE_HANDLE, &sh->state);
758 /* return a pointer to the address conversion region of the scribble buffer */
/*
 * The scribble buffer begins with room for sh->disks + 2 page pointers;
 * the address conversion region starts right after them. The offset is
 * computed in bytes, which assumes percpu->scribble has a byte-granular
 * pointer type (for example void * with the GNU extension) - TODO confirm.
 */
759 static addr_conv_t *to_addr_conv(struct stripe_head *sh,
760 struct raid5_percpu *percpu)
762 return percpu->scribble + sizeof(struct page *) * (sh->disks + 2);
/*
 * RAID-5 compute: rebuild the block sh->ops.target by XOR-ing pages of the
 * stripe into the target page.
 * The source pages are collected in the percpu scribble buffer, a stripe
 * reference is taken for the completion callback ops_complete_compute(),
 * and the work is submitted with ASYNC_TX_FENCE | ASYNC_TX_XOR_ZERO_DST.
 * With a single source the XOR degenerates to async_memcpy().
 * NOTE(review): the filter inside the collection loop (presumably skipping
 * the target itself) and the return statement are not visible in this
 * listing.
 */
765 static struct dma_async_tx_descriptor *
766 ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
768 int disks = sh->disks;
769 struct page **xor_srcs = percpu->scribble;
770 int target = sh->ops.target;
771 struct r5dev *tgt = &sh->dev[target];
772 struct page *xor_dest = tgt->page;
774 struct dma_async_tx_descriptor *tx;
775 struct async_submit_ctl submit;
778 pr_debug("%s: stripe %llu block: %d\n",
779 __func__, (unsigned long long)sh->sector, target);
780 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
782 for (i = disks; i--; )
784 xor_srcs[count++] = sh->dev[i].page;
786 atomic_inc(&sh->count);
788 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL,
789 ops_complete_compute, sh, to_addr_conv(sh, percpu));
790 if (unlikely(count == 1))
791 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
793 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
798 /* set_syndrome_sources - populate source buffers for gen_syndrome
799 * @srcs - (struct page *) array of size sh->disks
800 * @sh - stripe_head to parse
802 * Populates srcs in proper layout order for the stripe and returns the
803 * 'count' of sources to be used in a call to async_gen_syndrome. The P
804 * destination buffer is recorded in srcs[count] and the Q destination
805 * is recorded in srcs[count+1].
/*
 * Fill srcs with the stripe pages in syndrome slot order and return the
 * number of syndrome (data) slots: sh->disks for a DDF layout, otherwise
 * sh->disks - 2.
 * Starting at raid6_d0(), each device is mapped to its slot with
 * raid6_idx_to_slot() and the walk follows raid6_next_disk() until it
 * comes back to the start.
 * NOTE(review): the body of the first loop (presumably clearing srcs) and
 * the initial assignments before the do-loop are not visible in this
 * listing.
 */
807 static int set_syndrome_sources(struct page **srcs, struct stripe_head *sh)
809 int disks = sh->disks;
810 int syndrome_disks = sh->ddf_layout ? disks : (disks - 2);
811 int d0_idx = raid6_d0(sh);
815 for (i = 0; i < disks; i++)
821 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
823 srcs[slot] = sh->dev[i].page;
824 i = raid6_next_disk(i, disks);
825 } while (i != d0_idx);
827 return syndrome_disks;
/*
 * RAID-6 compute with exactly one missing block.
 * The single valid target is whichever of sh->ops.target and
 * sh->ops.target2 is not negative. If it is the Q block, the syndrome is
 * regenerated with async_gen_syndrome() with the P destination set to NULL
 * so that P is not rewritten. Any other target (data or P) is rebuilt by
 * XOR over pages gathered in a loop that tests for the target and Q.
 * A stripe reference is taken for ops_complete_compute().
 * NOTE(review): the assignment of dest, the BUG for two valid targets,
 * the skip statement inside the XOR loop and the return statement are not
 * visible in this listing.
 */
830 static struct dma_async_tx_descriptor *
831 ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu)
833 int disks = sh->disks;
834 struct page **blocks = percpu->scribble;
836 int qd_idx = sh->qd_idx;
837 struct dma_async_tx_descriptor *tx;
838 struct async_submit_ctl submit;
844 if (sh->ops.target < 0)
845 target = sh->ops.target2;
846 else if (sh->ops.target2 < 0)
847 target = sh->ops.target;
849 /* we should only have one valid target */
852 pr_debug("%s: stripe %llu block: %d\n",
853 __func__, (unsigned long long)sh->sector, target);
855 tgt = &sh->dev[target];
856 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
859 atomic_inc(&sh->count);
861 if (target == qd_idx) {
862 count = set_syndrome_sources(blocks, sh);
863 blocks[count] = NULL; /* regenerating p is not necessary */
864 BUG_ON(blocks[count+1] != dest); /* q should already be set */
865 init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
866 ops_complete_compute, sh,
867 to_addr_conv(sh, percpu));
868 tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
870 /* Compute any data- or p-drive using XOR */
872 for (i = disks; i-- ; ) {
873 if (i == target || i == qd_idx)
875 blocks[count++] = sh->dev[i].page;
878 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
879 NULL, ops_complete_compute, sh,
880 to_addr_conv(sh, percpu));
881 tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit);
/*
 * RAID-6 compute with two missing blocks (both sh->ops.target and
 * sh->ops.target2 valid and flagged R5_Wantcompute).
 *
 * The pages are laid out in syndrome slot order, and the slots of the two
 * targets are recorded as faila and failb. Four cases follow:
 *   - P and Q missing: regenerate both with async_gen_syndrome();
 *   - data and Q missing: rebuild the data block by XOR (a loop that
 *     tests for the data target and Q), then regenerate the syndrome
 *     chained after that XOR;
 *   - data and P missing: async_raid6_datap_recov();
 *   - two data blocks missing: async_raid6_2data_recov().
 * A stripe reference is taken for ops_complete_compute().
 *
 * NOTE(review): the statements that assign faila/failb and order them,
 * the skip statement inside the XOR loop and several argument lines are
 * not visible in this listing.
 */
887 static struct dma_async_tx_descriptor *
888 ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
890 int i, count, disks = sh->disks;
891 int syndrome_disks = sh->ddf_layout ? disks : disks-2;
892 int d0_idx = raid6_d0(sh);
893 int faila = -1, failb = -1;
894 int target = sh->ops.target;
895 int target2 = sh->ops.target2;
896 struct r5dev *tgt = &sh->dev[target];
897 struct r5dev *tgt2 = &sh->dev[target2];
898 struct dma_async_tx_descriptor *tx;
899 struct page **blocks = percpu->scribble;
900 struct async_submit_ctl submit;
902 pr_debug("%s: stripe %llu block1: %d block2: %d\n",
903 __func__, (unsigned long long)sh->sector, target, target2);
904 BUG_ON(target < 0 || target2 < 0);
905 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
906 BUG_ON(!test_bit(R5_Wantcompute, &tgt2->flags));
908 /* we need to open-code set_syndrome_sources to handle the
909 * slot number conversion for 'faila' and 'failb'
911 for (i = 0; i < disks ; i++)
916 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
918 blocks[slot] = sh->dev[i].page;
924 i = raid6_next_disk(i, disks);
925 } while (i != d0_idx);
927 BUG_ON(faila == failb);
930 pr_debug("%s: stripe: %llu faila: %d failb: %d\n",
931 __func__, (unsigned long long)sh->sector, faila, failb);
933 atomic_inc(&sh->count);
935 if (failb == syndrome_disks+1) {
936 /* Q disk is one of the missing disks */
937 if (faila == syndrome_disks) {
938 /* Missing P+Q, just recompute */
939 init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
940 ops_complete_compute, sh,
941 to_addr_conv(sh, percpu));
942 return async_gen_syndrome(blocks, 0, syndrome_disks+2,
943 STRIPE_SIZE, &submit);
947 int qd_idx = sh->qd_idx;
949 /* Missing D+Q: recompute D from P, then recompute Q */
950 if (target == qd_idx)
951 data_target = target2;
953 data_target = target;
956 for (i = disks; i-- ; ) {
957 if (i == data_target || i == qd_idx)
959 blocks[count++] = sh->dev[i].page;
961 dest = sh->dev[data_target].page;
962 init_async_submit(&submit,
963 ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
965 to_addr_conv(sh, percpu));
966 tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE,
969 count = set_syndrome_sources(blocks, sh);
970 init_async_submit(&submit, ASYNC_TX_FENCE, tx,
971 ops_complete_compute, sh,
972 to_addr_conv(sh, percpu));
973 return async_gen_syndrome(blocks, 0, count+2,
974 STRIPE_SIZE, &submit);
977 init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
978 ops_complete_compute, sh,
979 to_addr_conv(sh, percpu));
980 if (failb == syndrome_disks) {
981 /* We're missing D+P. */
982 return async_raid6_datap_recov(syndrome_disks+2,
986 /* We're missing D+D. */
987 return async_raid6_2data_recov(syndrome_disks+2,
988 STRIPE_SIZE, faila, failb,
/*
 * Completion callback for the prexor step. The visible body only logs the
 * stripe sector at debug level; no stripe state is changed here.
 */
995 static void ops_complete_prexor(void *stripe_head_ref)
997 struct stripe_head *sh = stripe_head_ref;
999 pr_debug("%s: stripe %llu\n", __func__,
1000 (unsigned long long)sh->sector);
/*
 * Read-modify-write step one: XOR the current contents of every block
 * flagged R5_Wantdrain into the parity page.
 * The parity page is both the destination and the first source
 * (ASYNC_TX_XOR_DROP_DST), so the old data is subtracted from the existing
 * parity. The operation is chained after tx and completes through
 * ops_complete_prexor().
 * NOTE(review): the return statement is not visible in this listing;
 * the caller assigns the result to its tx variable.
 */
1003 static struct dma_async_tx_descriptor *
1004 ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu,
1005 struct dma_async_tx_descriptor *tx)
1007 int disks = sh->disks;
1008 struct page **xor_srcs = percpu->scribble;
1009 int count = 0, pd_idx = sh->pd_idx, i;
1010 struct async_submit_ctl submit;
1012 /* existing parity data subtracted */
1013 struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
1015 pr_debug("%s: stripe %llu\n", __func__,
1016 (unsigned long long)sh->sector);
1018 for (i = disks; i--; ) {
1019 struct r5dev *dev = &sh->dev[i];
1020 /* Only process blocks that are known to be uptodate */
1021 if (test_bit(R5_Wantdrain, &dev->flags))
1022 xor_srcs[count++] = dev->page;
1025 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
1026 ops_complete_prexor, sh, to_addr_conv(sh, percpu));
1027 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
/*
 * Copy the data of pending write bios into the stripe cache.
 * For each device whose R5_Wantdrain flag was set (it is cleared here),
 * the towrite list is moved to dev->written under the device lock
 * (dev->written must be empty, enforced by BUG_ON). Every bio that starts
 * inside the device range is then copied into dev->page with
 * async_copy_data(1, ...). R5_WantFUA is set on the device when any of
 * its bios carries REQ_FUA.
 * NOTE(review): the return statement is not visible in this listing;
 * the caller assigns the result to its tx variable.
 */
1032 static struct dma_async_tx_descriptor *
1033 ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
1035 int disks = sh->disks;
1038 pr_debug("%s: stripe %llu\n", __func__,
1039 (unsigned long long)sh->sector);
1041 for (i = disks; i--; ) {
1042 struct r5dev *dev = &sh->dev[i];
1045 if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) {
1048 spin_lock_irq(&sh->raid_conf->device_lock);
1049 chosen = dev->towrite;
1050 dev->towrite = NULL;
1051 BUG_ON(dev->written);
1052 wbi = dev->written = chosen;
1053 spin_unlock_irq(&sh->raid_conf->device_lock);
1055 while (wbi && wbi->bi_sector <
1056 dev->sector + STRIPE_SECTORS) {
1057 if (wbi->bi_rw & REQ_FUA)
1058 set_bit(R5_WantFUA, &dev->flags);
1059 tx = async_copy_data(1, wbi, dev->page,
1061 wbi = r5_next_bio(wbi, dev->sector);
/*
 * Completion callback for parity reconstruction.
 * fua accumulates R5_WantFUA across all devices. Every device that was
 * written, plus the P and Q devices, is then marked R5_UPTODATE and may
 * get R5_WantFUA set. reconstruct_state advances from the matching *_run
 * value to its *_result value (any other value than the three run states
 * is a BUG), and STRIPE_HANDLE is set.
 * NOTE(review): the declaration of fua, the condition guarding the
 * R5_WantFUA propagation and the final stripe release are not visible in
 * this listing.
 */
1069 static void ops_complete_reconstruct(void *stripe_head_ref)
1071 struct stripe_head *sh = stripe_head_ref;
1072 int disks = sh->disks;
1073 int pd_idx = sh->pd_idx;
1074 int qd_idx = sh->qd_idx;
1078 pr_debug("%s: stripe %llu\n", __func__,
1079 (unsigned long long)sh->sector);
1081 for (i = disks; i--; )
1082 fua |= test_bit(R5_WantFUA, &sh->dev[i].flags);
1084 for (i = disks; i--; ) {
1085 struct r5dev *dev = &sh->dev[i];
1087 if (dev->written || i == pd_idx || i == qd_idx) {
1088 set_bit(R5_UPTODATE, &dev->flags);
1090 set_bit(R5_WantFUA, &dev->flags);
1094 if (sh->reconstruct_state == reconstruct_state_drain_run)
1095 sh->reconstruct_state = reconstruct_state_drain_result;
1096 else if (sh->reconstruct_state == reconstruct_state_prexor_drain_run)
1097 sh->reconstruct_state = reconstruct_state_prexor_drain_result;
1099 BUG_ON(sh->reconstruct_state != reconstruct_state_run);
1100 sh->reconstruct_state = reconstruct_state_result;
1103 set_bit(STRIPE_HANDLE, &sh->state);
/*
 * RAID-5 parity generation, chained after tx.
 * In the prexor case (reconstruct_state_prexor_drain_run) the parity page
 * is reused as the first source and the XOR runs with
 * ASYNC_TX_XOR_DROP_DST; otherwise parity is recomputed from scratch with
 * ASYNC_TX_XOR_ZERO_DST. A stripe reference is taken for
 * ops_complete_reconstruct(). With a single source the XOR degenerates to
 * async_memcpy().
 * NOTE(review): the return-type line, the prexor flag assignment and the
 * per-device filters inside both collection loops are not visible in this
 * listing.
 */
1108 ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu,
1109 struct dma_async_tx_descriptor *tx)
1111 int disks = sh->disks;
1112 struct page **xor_srcs = percpu->scribble;
1113 struct async_submit_ctl submit;
1114 int count = 0, pd_idx = sh->pd_idx, i;
1115 struct page *xor_dest;
1117 unsigned long flags;
1119 pr_debug("%s: stripe %llu\n", __func__,
1120 (unsigned long long)sh->sector);
1122 /* check if prexor is active which means only process blocks
1123 * that are part of a read-modify-write (written)
1125 if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
1127 xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
1128 for (i = disks; i--; ) {
1129 struct r5dev *dev = &sh->dev[i];
1131 xor_srcs[count++] = dev->page;
1134 xor_dest = sh->dev[pd_idx].page;
1135 for (i = disks; i--; ) {
1136 struct r5dev *dev = &sh->dev[i];
1138 xor_srcs[count++] = dev->page;
1142 /* 1/ if we prexor'd then the dest is reused as a source
1143 * 2/ if we did not prexor then we are redoing the parity
1144 * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST
1145 * for the synchronous xor case
1147 flags = ASYNC_TX_ACK |
1148 (prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST);
1150 atomic_inc(&sh->count);
1152 init_async_submit(&submit, flags, tx, ops_complete_reconstruct, sh,
1153 to_addr_conv(sh, percpu));
1154 if (unlikely(count == 1))
1155 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
1157 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
/*
 * RAID-6 P/Q generation, chained after tx.
 * The sources are laid out with set_syndrome_sources(), a stripe reference
 * is taken for ops_complete_reconstruct(), and async_gen_syndrome() is
 * called over count + 2 blocks (the data slots plus P and Q).
 * NOTE(review): the return-type line and the declaration of count are not
 * visible in this listing.
 */
1161 ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
1162 struct dma_async_tx_descriptor *tx)
1164 struct async_submit_ctl submit;
1165 struct page **blocks = percpu->scribble;
1168 pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);
1170 count = set_syndrome_sources(blocks, sh);
1172 atomic_inc(&sh->count);
1174 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_reconstruct,
1175 sh, to_addr_conv(sh, percpu));
1176 async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
/*
 * Completion callback for the parity check operations: records that the
 * result is available (check_state_check_result) and sets STRIPE_HANDLE.
 * NOTE(review): the final stripe release is not visible in this listing.
 */
1179 static void ops_complete_check(void *stripe_head_ref)
1181 struct stripe_head *sh = stripe_head_ref;
1183 pr_debug("%s: stripe %llu\n", __func__,
1184 (unsigned long long)sh->sector);
1186 sh->check_state = check_state_check_result;
1187 set_bit(STRIPE_HANDLE, &sh->state);
/*
 * Verify the P parity of a stripe.
 * The parity page and pages gathered in a loop that tests for P and Q are
 * passed to async_xor_val(), which stores its verdict in
 * sh->ops.zero_sum_result. A stripe reference is then taken and
 * ops_complete_check() is scheduled after the validation via
 * async_trigger_callback().
 * NOTE(review): the initialisation of count and the skip statement inside
 * the collection loop are not visible in this listing.
 */
1191 static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu)
1193 int disks = sh->disks;
1194 int pd_idx = sh->pd_idx;
1195 int qd_idx = sh->qd_idx;
1196 struct page *xor_dest;
1197 struct page **xor_srcs = percpu->scribble;
1198 struct dma_async_tx_descriptor *tx;
1199 struct async_submit_ctl submit;
1203 pr_debug("%s: stripe %llu\n", __func__,
1204 (unsigned long long)sh->sector);
1207 xor_dest = sh->dev[pd_idx].page;
1208 xor_srcs[count++] = xor_dest;
1209 for (i = disks; i--; ) {
1210 if (i == pd_idx || i == qd_idx)
1212 xor_srcs[count++] = sh->dev[i].page;
1215 init_async_submit(&submit, 0, NULL, NULL, NULL,
1216 to_addr_conv(sh, percpu));
1217 tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
1218 &sh->ops.zero_sum_result, &submit);
1220 atomic_inc(&sh->count);
1221 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_check, sh, NULL);
1222 tx = async_trigger_callback(&submit);
/*
 * Verify the RAID-6 syndrome of a stripe.
 * Sources are laid out with set_syndrome_sources(), a stripe reference is
 * taken for ops_complete_check(), and async_syndrome_val() stores its
 * verdict in sh->ops.zero_sum_result, using percpu->spare_page as scratch
 * space.
 * NOTE(review): how checkp alters the source list is in lines not visible
 * in this listing; callers pass 0 for a Q-only check and 1 for P+Q.
 */
1225 static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp)
1227 struct page **srcs = percpu->scribble;
1228 struct async_submit_ctl submit;
1231 pr_debug("%s: stripe %llu checkp: %d\n", __func__,
1232 (unsigned long long)sh->sector, checkp);
1234 count = set_syndrome_sources(srcs, sh);
1238 atomic_inc(&sh->count);
1239 init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check,
1240 sh, to_addr_conv(sh, percpu));
1241 async_syndrome_val(srcs, 0, count+2, STRIPE_SIZE,
1242 &sh->ops.zero_sum_result, percpu->spare_page, &submit);
/*
 * Run the stripe operations selected by the bits of ops_request, using the
 * percpu scratch area of the current CPU.
 *
 * Order of dispatch: biofill, compute (RAID-5 helper, or the one-target or
 * two-target RAID-6 helper depending on whether either target is
 * negative), prexor, biodrain, reconstruct, and finally the check variant
 * selected by sh->check_state. The descriptor tx is threaded through
 * compute, prexor, biodrain and reconstruct so that they run in that
 * order. At the end R5_Overlap is cleared on every device, waking
 * wait_for_overlap waiters.
 *
 * NOTE(review): the lines that obtain and release cpu, the level tests
 * that choose between the RAID-5 and RAID-6 helpers, and the condition
 * guarding the final overlap loop are not visible in this listing.
 */
1245 static void __raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
1247 int overlap_clear = 0, i, disks = sh->disks;
1248 struct dma_async_tx_descriptor *tx = NULL;
1249 raid5_conf_t *conf = sh->raid_conf;
1250 int level = conf->level;
1251 struct raid5_percpu *percpu;
1255 percpu = per_cpu_ptr(conf->percpu, cpu);
1256 if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) {
1257 ops_run_biofill(sh);
1261 if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) {
1263 tx = ops_run_compute5(sh, percpu);
1265 if (sh->ops.target2 < 0 || sh->ops.target < 0)
1266 tx = ops_run_compute6_1(sh, percpu);
1268 tx = ops_run_compute6_2(sh, percpu);
1270 /* terminate the chain if reconstruct is not set to be run */
1271 if (tx && !test_bit(STRIPE_OP_RECONSTRUCT, &ops_request))
1275 if (test_bit(STRIPE_OP_PREXOR, &ops_request))
1276 tx = ops_run_prexor(sh, percpu, tx);
1278 if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) {
1279 tx = ops_run_biodrain(sh, tx);
1283 if (test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) {
1285 ops_run_reconstruct5(sh, percpu, tx);
1287 ops_run_reconstruct6(sh, percpu, tx);
1290 if (test_bit(STRIPE_OP_CHECK, &ops_request)) {
1291 if (sh->check_state == check_state_run)
1292 ops_run_check_p(sh, percpu);
1293 else if (sh->check_state == check_state_run_q)
1294 ops_run_check_pq(sh, percpu, 0);
1295 else if (sh->check_state == check_state_run_pq)
1296 ops_run_check_pq(sh, percpu, 1);
1302 for (i = disks; i--; ) {
1303 struct r5dev *dev = &sh->dev[i];
1304 if (test_and_clear_bit(R5_Overlap, &dev->flags))
1305 wake_up(&sh->raid_conf->wait_for_overlap);
1310 #ifdef CONFIG_MULTICORE_RAID456
/*
 * async_run_ops - asynchronous entry point that runs a stripe's staged ops.
 * @param:  the stripe_head whose sh->ops.request is to be executed
 * @cookie: async cookie; not used on the lines visible here
 *
 * Copies sh->ops.request into a local first, then clears
 * STRIPE_OPS_REQ_PENDING and wakes sh->ops.wait_for_ops so that another
 * request can be staged, and finally runs __raid_run_ops() on the copy.
 */
1311 static void async_run_ops(void *param, async_cookie_t cookie)
1313 struct stripe_head *sh = param;
1314 unsigned long ops_request = sh->ops.request;
1316 clear_bit_unlock(STRIPE_OPS_REQ_PENDING, &sh->state);
1317 wake_up(&sh->ops.wait_for_ops);
1319 __raid_run_ops(sh, ops_request);
/*
 * raid_run_ops - stage @ops_request on the stripe and schedule it to run
 * asynchronously.
 *
 * Waits on sh->ops.wait_for_ops until this caller is the one that sets
 * STRIPE_OPS_REQ_PENDING, stores @ops_request in sh->ops.request, takes a
 * reference on the stripe (sh->count) and schedules async_run_ops() with the
 * stripe as its argument.
 */
1323 static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
1325 /* since handle_stripe can be called outside of raid5d context
1326 * we need to ensure sh->ops.request is de-staged before another
1329 wait_event(sh->ops.wait_for_ops,
1330 !test_and_set_bit_lock(STRIPE_OPS_REQ_PENDING, &sh->state));
1331 sh->ops.request = ops_request;
1333 atomic_inc(&sh->count);
1334 async_schedule(async_run_ops, sh);
/*
 * Alias raid_run_ops directly to the synchronous __raid_run_ops.
 * NOTE(review): presumably this is the branch used when
 * CONFIG_MULTICORE_RAID456 is not set; the #else/#endif lines are not
 * visible in this listing - confirm.
 */
1337 #define raid_run_ops __raid_run_ops
/*
 * grow_one_stripe - allocate one stripe_head and set it up as active.
 *
 * Allocates a zeroed stripe_head from conf->slab_cache (GFP_KERNEL), links
 * it to @conf, initialises the ops wait queue when CONFIG_MULTICORE_RAID456
 * is defined, and calls grow_buffers().  If grow_buffers() reports failure
 * the stripe_head is returned to the slab cache.  On the success path the
 * reference count is set to 1, conf->active_stripes is incremented and the
 * lru list head is initialised.
 *
 * NOTE(review): the return statements and the allocation-failure check are
 * on lines missing from this listing.
 */
1340 static int grow_one_stripe(raid5_conf_t *conf)
1342 struct stripe_head *sh;
1343 sh = kmem_cache_zalloc(conf->slab_cache, GFP_KERNEL);
1347 sh->raid_conf = conf;
1348 #ifdef CONFIG_MULTICORE_RAID456
1349 init_waitqueue_head(&sh->ops.wait_for_ops);
1352 if (grow_buffers(sh)) {
1354 kmem_cache_free(conf->slab_cache, sh);
1357 /* we just created an active stripe so... */
1358 atomic_set(&sh->count, 1);
1359 atomic_inc(&conf->active_stripes);
1360 INIT_LIST_HEAD(&sh->lru);
/*
 * grow_stripes - create the stripe_head slab cache for @conf and populate it.
 *
 * Two cache names are prepared: cache_name[0] is built from the raid level
 * and either the md device name (when the array has a gendisk) or the mddev
 * pointer value, and cache_name[1] is cache_name[0] with "-alt" appended.
 * The cache is created under cache_name[0]; each object is a struct
 * stripe_head followed by room for devs-1 additional struct r5dev, where
 * devs is the larger of raid_disks and previous_raid_disks.  The new cache
 * and devs are recorded in conf->slab_cache and conf->pool_size, and
 * grow_one_stripe() is called to add stripes.
 *
 * NOTE(review): the loop over @num, the remaining kmem_cache_create()
 * arguments, the error checks and the return values are on lines missing
 * from this listing.
 */
1365 static int grow_stripes(raid5_conf_t *conf, int num)
1367 struct kmem_cache *sc;
1368 int devs = max(conf->raid_disks, conf->previous_raid_disks);
1370 if (conf->mddev->gendisk)
1371 sprintf(conf->cache_name[0],
1372 "raid%d-%s", conf->level, mdname(conf->mddev));
1374 sprintf(conf->cache_name[0],
1375 "raid%d-%p", conf->level, conf->mddev);
1376 sprintf(conf->cache_name[1], "%s-alt", conf->cache_name[0]);
1378 conf->active_name = 0;
1379 sc = kmem_cache_create(conf->cache_name[conf->active_name],
1380 sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev),
1384 conf->slab_cache = sc;
1385 conf->pool_size = devs;
1387 if (!grow_one_stripe(conf))
1393 * scribble_len - return the required size of the scribble region
1394 * @num - total number of disks in the array
1396 * The size must be enough to contain:
1397 * 1/ a struct page pointer for each device in the array +2
1398 * 2/ room to convert each entry in (1) to its corresponding dma
1399 * (dma_map_page()) or page (page_address()) address.
1401 * Note: the +2 is for the destination buffers of the ddf/raid6 case where we
1402 * calculate over all devices (not just the data blocks), using zeros in place
1403 * of the P and Q blocks.
/*
 * Computes the scribble size for @num devices as (num+2) page pointers plus
 * (num+2) addr_conv_t entries.
 * NOTE(review): the declaration of len and the return statement are on
 * lines missing from this listing.
 */
1405 static size_t scribble_len(int num)
1409 len = sizeof(struct page *) * (num+2) + sizeof(addr_conv_t) * (num+2);
/*
 * resize_stripes - grow every stripe_head so it can hold @newsize devices.
 *
 * Returns 0 immediately when @newsize does not exceed conf->pool_size.
 * Otherwise a second slab cache is created under the currently unused cache
 * name, conf->max_nr_stripes new stripe_heads are allocated from it, and each
 * old stripe_head is swapped for a new one by copying its page pointers
 * across.  conf->disks and the per-cpu scribble buffers are then reallocated
 * for the new size, missing pages are allocated for the added slots, and the
 * new stripes are released.  Finally the new cache, the active cache name
 * and the pool size are recorded in @conf.
 *
 * NOTE(review): error checks, the error-path bookkeeping and the return
 * statements other than the early "return 0" are on lines missing from this
 * listing.
 */
1414 static int resize_stripes(raid5_conf_t *conf, int newsize)
1416 /* Make all the stripes able to hold 'newsize' devices.
1417 * New slots in each stripe get 'page' set to a new page.
1419 * This happens in stages:
1420 * 1/ create a new kmem_cache and allocate the required number of
1422 * 2/ gather all the old stripe_heads and transfer the pages across
1423 * to the new stripe_heads. This will have the side effect of
1424 * freezing the array as once all stripe_heads have been collected,
1425 * no IO will be possible. Old stripe heads are freed once their
1426 * pages have been transferred over, and the old kmem_cache is
1427 * freed when all stripes are done.
1428 * 3/ reallocate conf->disks to be suitably bigger. If this fails,
1429 * we simply return a failure status - no need to clean anything up.
1430 * 4/ allocate new pages for the new slots in the new stripe_heads.
1431 * If this fails, we don't bother trying to shrink the
1432 * stripe_heads down again, we just leave them as they are.
1433 * As each stripe_head is processed the new one is released into
1436 * Once step2 is started, we cannot afford to wait for a write,
1437 * so we use GFP_NOIO allocations.
1439 struct stripe_head *osh, *nsh;
1440 LIST_HEAD(newstripes);
1441 struct disk_info *ndisks;
1444 struct kmem_cache *sc;
1447 if (newsize <= conf->pool_size)
1448 return 0; /* never bother to shrink */
1450 err = md_allow_write(conf->mddev);
1455 sc = kmem_cache_create(conf->cache_name[1-conf->active_name],
1456 sizeof(struct stripe_head)+(newsize-1)*sizeof(struct r5dev),
1461 for (i = conf->max_nr_stripes; i; i--) {
1462 nsh = kmem_cache_zalloc(sc, GFP_KERNEL);
1466 nsh->raid_conf = conf;
1467 #ifdef CONFIG_MULTICORE_RAID456
1468 init_waitqueue_head(&nsh->ops.wait_for_ops);
1471 list_add(&nsh->lru, &newstripes);
1474 /* didn't get enough, give up */
1475 while (!list_empty(&newstripes)) {
1476 nsh = list_entry(newstripes.next, struct stripe_head, lru);
1477 list_del(&nsh->lru);
1478 kmem_cache_free(sc, nsh);
1480 kmem_cache_destroy(sc);
1483 /* Step 2 - Must use GFP_NOIO now.
1484 * OK, we have enough stripes, start collecting inactive
1485 * stripes and copying them over
1487 list_for_each_entry(nsh, &newstripes, lru) {
1488 spin_lock_irq(&conf->device_lock);
1489 wait_event_lock_irq(conf->wait_for_stripe,
1490 !list_empty(&conf->inactive_list),
1493 osh = get_free_stripe(conf);
1494 spin_unlock_irq(&conf->device_lock);
1495 atomic_set(&nsh->count, 1);
1496 for(i=0; i<conf->pool_size; i++)
1497 nsh->dev[i].page = osh->dev[i].page;
1498 for( ; i<newsize; i++)
1499 nsh->dev[i].page = NULL;
1500 kmem_cache_free(conf->slab_cache, osh);
1502 kmem_cache_destroy(conf->slab_cache);
1505 * At this point, we are holding all the stripes so the array
1506 * is completely stalled, so now is a good time to resize
1507 * conf->disks and the scribble region
1509 ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO);
1511 for (i=0; i<conf->raid_disks; i++)
1512 ndisks[i] = conf->disks[i];
1514 conf->disks = ndisks;
1519 conf->scribble_len = scribble_len(newsize);
1520 for_each_present_cpu(cpu) {
1521 struct raid5_percpu *percpu;
1524 percpu = per_cpu_ptr(conf->percpu, cpu);
1525 scribble = kmalloc(conf->scribble_len, GFP_NOIO);
1528 kfree(percpu->scribble);
1529 percpu->scribble = scribble;
1537 /* Step 4, return new stripes to service */
1538 while(!list_empty(&newstripes)) {
1539 nsh = list_entry(newstripes.next, struct stripe_head, lru);
1540 list_del_init(&nsh->lru);
1542 for (i=conf->raid_disks; i < newsize; i++)
1543 if (nsh->dev[i].page == NULL) {
1544 struct page *p = alloc_page(GFP_NOIO);
1545 nsh->dev[i].page = p;
1549 release_stripe(nsh);
1551 /* critical section pass, GFP_NOIO no longer needed */
1553 conf->slab_cache = sc;
1554 conf->active_name = 1-conf->active_name;
1555 conf->pool_size = newsize;
/*
 * drop_one_stripe - take one free stripe_head and release it.
 *
 * Obtains a stripe through get_free_stripe() while holding
 * conf->device_lock, asserts that its reference count is zero, returns it
 * to conf->slab_cache and decrements conf->active_stripes.
 *
 * NOTE(review): the handling of a NULL result from get_free_stripe(), any
 * buffer release, and the return values are on lines missing from this
 * listing.
 */
1559 static int drop_one_stripe(raid5_conf_t *conf)
1561 struct stripe_head *sh;
1563 spin_lock_irq(&conf->device_lock);
1564 sh = get_free_stripe(conf);
1565 spin_unlock_irq(&conf->device_lock);
1568 BUG_ON(atomic_read(&sh->count));
1570 kmem_cache_free(conf->slab_cache, sh);
1571 atomic_dec(&conf->active_stripes);
/*
 * shrink_stripes - release all stripes and tear down the slab cache.
 *
 * Calls drop_one_stripe() repeatedly for as long as it returns non-zero,
 * then destroys conf->slab_cache if one exists and clears the pointer.
 */
1575 static void shrink_stripes(raid5_conf_t *conf)
1577 while (drop_one_stripe(conf))
1580 if (conf->slab_cache)
1581 kmem_cache_destroy(conf->slab_cache);
1582 conf->slab_cache = NULL;
/*
 * raid5_end_read_request - completion handler for a per-device read bio.
 *
 * The owning stripe is taken from bi->bi_private and the device index is
 * found by comparing @bi with each sh->dev[i].req.
 *
 * When the read produced valid data the device is marked R5_UPTODATE.  If
 * R5_ReadError was set, a "read error corrected" message is logged,
 * STRIPE_SECTORS is added to rdev->corrected_errors and the R5_ReadError and
 * R5_ReWrite flags are cleared.  A non-zero rdev->read_errors count is reset
 * to 0.
 *
 * Otherwise R5_UPTODATE is cleared, rdev->read_errors is incremented and one
 * of three conditions selects a log message: the array is already degraded
 * by at least max_degraded, the failing read followed a rewrite
 * (R5_ReWrite), or read_errors exceeds conf->max_nr_stripes.  The handler
 * then either sets R5_ReadError or clears R5_ReadError/R5_ReWrite and
 * reports the device through md_error().
 *
 * In all cases the rdev pending count is dropped, R5_LOCKED is cleared and
 * STRIPE_HANDLE is set on the stripe.
 *
 * NOTE(review): the branch conditions that choose between these paths, the
 * printk calls themselves and the final stripe release are on lines missing
 * from this listing; the description above is limited to the visible
 * statements.
 */
1585 static void raid5_end_read_request(struct bio * bi, int error)
1587 struct stripe_head *sh = bi->bi_private;
1588 raid5_conf_t *conf = sh->raid_conf;
1589 int disks = sh->disks, i;
1590 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
1591 char b[BDEVNAME_SIZE];
1595 for (i=0 ; i<disks; i++)
1596 if (bi == &sh->dev[i].req)
1599 pr_debug("end_read_request %llu/%d, count: %d, uptodate %d.\n",
1600 (unsigned long long)sh->sector, i, atomic_read(&sh->count),
1608 set_bit(R5_UPTODATE, &sh->dev[i].flags);
1609 if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
1610 rdev = conf->disks[i].rdev;
1613 "md/raid:%s: read error corrected"
1614 " (%lu sectors at %llu on %s)\n",
1615 mdname(conf->mddev), STRIPE_SECTORS,
1616 (unsigned long long)(sh->sector
1617 + rdev->data_offset),
1618 bdevname(rdev->bdev, b));
1619 atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
1620 clear_bit(R5_ReadError, &sh->dev[i].flags);
1621 clear_bit(R5_ReWrite, &sh->dev[i].flags);
1623 if (atomic_read(&conf->disks[i].rdev->read_errors))
1624 atomic_set(&conf->disks[i].rdev->read_errors, 0);
1626 const char *bdn = bdevname(conf->disks[i].rdev->bdev, b);
1628 rdev = conf->disks[i].rdev;
1630 clear_bit(R5_UPTODATE, &sh->dev[i].flags);
1631 atomic_inc(&rdev->read_errors);
1632 if (conf->mddev->degraded >= conf->max_degraded)
1635 "md/raid:%s: read error not correctable "
1636 "(sector %llu on %s).\n",
1637 mdname(conf->mddev),
1638 (unsigned long long)(sh->sector
1639 + rdev->data_offset),
1641 else if (test_bit(R5_ReWrite, &sh->dev[i].flags))
1645 "md/raid:%s: read error NOT corrected!! "
1646 "(sector %llu on %s).\n",
1647 mdname(conf->mddev),
1648 (unsigned long long)(sh->sector
1649 + rdev->data_offset),
1651 else if (atomic_read(&rdev->read_errors)
1652 > conf->max_nr_stripes)
1654 "md/raid:%s: Too many read errors, failing device %s.\n",
1655 mdname(conf->mddev), bdn);
1659 set_bit(R5_ReadError, &sh->dev[i].flags);
1661 clear_bit(R5_ReadError, &sh->dev[i].flags);
1662 clear_bit(R5_ReWrite, &sh->dev[i].flags);
1663 md_error(conf->mddev, rdev);
1666 rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
1667 clear_bit(R5_LOCKED, &sh->dev[i].flags);
1668 set_bit(STRIPE_HANDLE, &sh->state);
/*
 * raid5_end_write_request - completion handler for a per-device write bio.
 *
 * The owning stripe is taken from bi->bi_private and the device index is
 * found by comparing @bi with each sh->dev[i].req.
 *
 * On one path WriteErrorSeen is set on the rdev and R5_WriteError on the
 * stripe device; on the alternative path, if is_badblock() reports a known
 * bad range covering this stripe's sectors, R5_MadeGood is set.  Afterwards
 * the rdev pending count is dropped, R5_LOCKED is cleared and STRIPE_HANDLE
 * is set.
 *
 * NOTE(review): the condition guarding the error path (presumably
 * !uptodate), the declarations of first_bad and bad_sectors and the final
 * stripe release are on lines missing from this listing.
 */
1672 static void raid5_end_write_request(struct bio *bi, int error)
1674 struct stripe_head *sh = bi->bi_private;
1675 raid5_conf_t *conf = sh->raid_conf;
1676 int disks = sh->disks, i;
1677 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
1681 for (i=0 ; i<disks; i++)
1682 if (bi == &sh->dev[i].req)
1685 pr_debug("end_write_request %llu/%d, count %d, uptodate: %d.\n",
1686 (unsigned long long)sh->sector, i, atomic_read(&sh->count),
1694 set_bit(WriteErrorSeen, &conf->disks[i].rdev->flags);
1695 set_bit(R5_WriteError, &sh->dev[i].flags);
1696 } else if (is_badblock(conf->disks[i].rdev, sh->sector, STRIPE_SECTORS,
1697 &first_bad, &bad_sectors))
1698 set_bit(R5_MadeGood, &sh->dev[i].flags);
1700 rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
1702 clear_bit(R5_LOCKED, &sh->dev[i].flags);
1703 set_bit(STRIPE_HANDLE, &sh->state);
1708 static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous);
/*
 * raid5_build_block - initialise the embedded bio of device @i in stripe @sh.
 *
 * The bio (dev->req) is initialised and pointed at the device's single
 * embedded bio_vec, which covers STRIPE_SIZE bytes of dev->page from offset
 * 0.  The bio's sector is set to sh->sector and its private pointer to the
 * stripe, which is how the end_request handlers find the stripe again.
 * dev->sector receives the value computed by compute_blocknr() for this
 * device.
 *
 * NOTE(review): some lines (for example 1716 and 1724-1725) are missing from
 * this listing.
 */
1710 static void raid5_build_block(struct stripe_head *sh, int i, int previous)
1712 struct r5dev *dev = &sh->dev[i];
1714 bio_init(&dev->req);
1715 dev->req.bi_io_vec = &dev->vec;
1717 dev->req.bi_max_vecs++;
1718 dev->vec.bv_page = dev->page;
1719 dev->vec.bv_len = STRIPE_SIZE;
1720 dev->vec.bv_offset = 0;
1722 dev->req.bi_sector = sh->sector;
1723 dev->req.bi_private = sh;
1726 dev->sector = compute_blocknr(sh, i, previous);
/*
 * error - mark @rdev as failed in array @mddev.
 *
 * If the device was In_sync, that flag is cleared, conf->device_lock is
 * taken and released with interrupts saved, and MD_RECOVERY_INTR is set so a
 * running recovery aborts.  The device is then flagged Blocked and Faulty,
 * MD_CHANGE_DEVS is set on the array, and a message naming the device and
 * the number of remaining devices (raid_disks - degraded) is logged.
 *
 * NOTE(review): the statement executed under device_lock and the printk
 * call line are missing from this listing.
 */
1729 static void error(mddev_t *mddev, mdk_rdev_t *rdev)
1731 char b[BDEVNAME_SIZE];
1732 raid5_conf_t *conf = mddev->private;
1733 pr_debug("raid456: error called\n");
1735 if (test_and_clear_bit(In_sync, &rdev->flags)) {
1736 unsigned long flags;
1737 spin_lock_irqsave(&conf->device_lock, flags);
1739 spin_unlock_irqrestore(&conf->device_lock, flags);
1741 * if recovery was running, make sure it aborts.
1743 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
1745 set_bit(Blocked, &rdev->flags);
1746 set_bit(Faulty, &rdev->flags);
1747 set_bit(MD_CHANGE_DEVS, &mddev->flags);
1749 "md/raid:%s: Disk failure on %s, disabling device.\n"
1750 "md/raid:%s: Operation continuing on %d devices.\n",
1752 bdevname(rdev->bdev, b),
1754 conf->raid_disks - mddev->degraded);
1758 * Input: a 'big' sector number,
1759 * Output: index of the data and parity disk, and the sector # in them.
/*
 * raid5_compute_sector - map an array sector to a member device and sector.
 * @conf:     array configuration
 * @r_sector: sector number within the array
 * @previous: non-zero selects the previous (pre-reshape) layout, chunk size
 *            and disk count instead of the current ones
 * @dd_idx:   output, index of the data disk holding @r_sector
 * @sh:       stripe_head that receives pd_idx, qd_idx and ddf_layout
 *
 * @r_sector is split into a chunk number and an offset within the chunk;
 * the chunk number is split into a stripe number and a data-disk index.
 * The parity index (pd_idx), the Q index (qd_idx) and the final *dd_idx
 * are then chosen according to conf->level and the layout algorithm.
 * The sector computed at the end is stripe * sectors_per_chunk plus the
 * offset within the chunk.
 *
 * NOTE(review): the case labels for the raid levels, the break statements,
 * several assignments, the guard around the writes to @sh and the return
 * statement are on lines missing from this listing.
 */
1761 static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector,
1762 int previous, int *dd_idx,
1763 struct stripe_head *sh)
1765 sector_t stripe, stripe2;
1766 sector_t chunk_number;
1767 unsigned int chunk_offset;
1770 sector_t new_sector;
1771 int algorithm = previous ? conf->prev_algo
1773 int sectors_per_chunk = previous ? conf->prev_chunk_sectors
1774 : conf->chunk_sectors;
1775 int raid_disks = previous ? conf->previous_raid_disks
1777 int data_disks = raid_disks - conf->max_degraded;
1779 /* First compute the information on this sector */
1782 * Compute the chunk number and the sector offset inside the chunk
1784 chunk_offset = sector_div(r_sector, sectors_per_chunk);
1785 chunk_number = r_sector;
1788 * Compute the stripe number
1790 stripe = chunk_number;
1791 *dd_idx = sector_div(stripe, data_disks);
1794 * Select the parity disk based on the user selected algorithm.
1796 pd_idx = qd_idx = -1;
1797 switch(conf->level) {
1799 pd_idx = data_disks;
1802 switch (algorithm) {
1803 case ALGORITHM_LEFT_ASYMMETRIC:
1804 pd_idx = data_disks - sector_div(stripe2, raid_disks);
1805 if (*dd_idx >= pd_idx)
1808 case ALGORITHM_RIGHT_ASYMMETRIC:
1809 pd_idx = sector_div(stripe2, raid_disks);
1810 if (*dd_idx >= pd_idx)
1813 case ALGORITHM_LEFT_SYMMETRIC:
1814 pd_idx = data_disks - sector_div(stripe2, raid_disks);
1815 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
1817 case ALGORITHM_RIGHT_SYMMETRIC:
1818 pd_idx = sector_div(stripe2, raid_disks);
1819 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
1821 case ALGORITHM_PARITY_0:
1825 case ALGORITHM_PARITY_N:
1826 pd_idx = data_disks;
1834 switch (algorithm) {
1835 case ALGORITHM_LEFT_ASYMMETRIC:
1836 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
1837 qd_idx = pd_idx + 1;
1838 if (pd_idx == raid_disks-1) {
1839 (*dd_idx)++; /* Q D D D P */
1841 } else if (*dd_idx >= pd_idx)
1842 (*dd_idx) += 2; /* D D P Q D */
1844 case ALGORITHM_RIGHT_ASYMMETRIC:
1845 pd_idx = sector_div(stripe2, raid_disks);
1846 qd_idx = pd_idx + 1;
1847 if (pd_idx == raid_disks-1) {
1848 (*dd_idx)++; /* Q D D D P */
1850 } else if (*dd_idx >= pd_idx)
1851 (*dd_idx) += 2; /* D D P Q D */
1853 case ALGORITHM_LEFT_SYMMETRIC:
1854 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
1855 qd_idx = (pd_idx + 1) % raid_disks;
1856 *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks;
1858 case ALGORITHM_RIGHT_SYMMETRIC:
1859 pd_idx = sector_div(stripe2, raid_disks);
1860 qd_idx = (pd_idx + 1) % raid_disks;
1861 *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks;
1864 case ALGORITHM_PARITY_0:
1869 case ALGORITHM_PARITY_N:
1870 pd_idx = data_disks;
1871 qd_idx = data_disks + 1;
1874 case ALGORITHM_ROTATING_ZERO_RESTART:
1875 /* Exactly the same as RIGHT_ASYMMETRIC, but order
1876 * of blocks for computing Q is different.
1878 pd_idx = sector_div(stripe2, raid_disks);
1879 qd_idx = pd_idx + 1;
1880 if (pd_idx == raid_disks-1) {
1881 (*dd_idx)++; /* Q D D D P */
1883 } else if (*dd_idx >= pd_idx)
1884 (*dd_idx) += 2; /* D D P Q D */
1888 case ALGORITHM_ROTATING_N_RESTART:
1889 /* Same as left_asymmetric, but first stripe is
1890 * D D D P Q rather than
1894 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
1895 qd_idx = pd_idx + 1;
1896 if (pd_idx == raid_disks-1) {
1897 (*dd_idx)++; /* Q D D D P */
1899 } else if (*dd_idx >= pd_idx)
1900 (*dd_idx) += 2; /* D D P Q D */
1904 case ALGORITHM_ROTATING_N_CONTINUE:
1905 /* Same as left_symmetric but Q is before P */
1906 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
1907 qd_idx = (pd_idx + raid_disks - 1) % raid_disks;
1908 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
1912 case ALGORITHM_LEFT_ASYMMETRIC_6:
1913 /* RAID5 left_asymmetric, with Q on last device */
1914 pd_idx = data_disks - sector_div(stripe2, raid_disks-1);
1915 if (*dd_idx >= pd_idx)
1917 qd_idx = raid_disks - 1;
1920 case ALGORITHM_RIGHT_ASYMMETRIC_6:
1921 pd_idx = sector_div(stripe2, raid_disks-1);
1922 if (*dd_idx >= pd_idx)
1924 qd_idx = raid_disks - 1;
1927 case ALGORITHM_LEFT_SYMMETRIC_6:
1928 pd_idx = data_disks - sector_div(stripe2, raid_disks-1);
1929 *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1);
1930 qd_idx = raid_disks - 1;
1933 case ALGORITHM_RIGHT_SYMMETRIC_6:
1934 pd_idx = sector_div(stripe2, raid_disks-1);
1935 *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1);
1936 qd_idx = raid_disks - 1;
1939 case ALGORITHM_PARITY_0_6:
1942 qd_idx = raid_disks - 1;
1952 sh->pd_idx = pd_idx;
1953 sh->qd_idx = qd_idx;
1954 sh->ddf_layout = ddf_layout;
1957 * Finally, compute the new sector number
1959 new_sector = (sector_t)stripe * sectors_per_chunk + chunk_offset;
/*
 * compute_blocknr - compute the array sector stored on device @i of @sh.
 * @sh:       stripe whose sector, pd_idx and qd_idx are used
 * @i:        device index within the stripe
 * @previous: non-zero selects the previous (pre-reshape) chunk size and
 *            layout algorithm
 *
 * This is the reverse of raid5_compute_sector(): sh->sector is split into a
 * stripe number and an offset within the chunk, @i is converted back to a
 * data-disk ordinal according to the layout algorithm, and the array sector
 * is rebuilt as (stripe * data_disks + i) * sectors_per_chunk + offset.
 * The Q disk yields 0.  The result is cross-checked by feeding it to
 * raid5_compute_sector(); a mismatch in sector, data index, pd_idx or qd_idx
 * is reported with a KERN_ERR message.
 *
 * NOTE(review): the per-case adjustments of @i that are not shown, the
 * return for the parity disk and the final return are on lines missing from
 * this listing.
 */
1964 static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous)
1966 raid5_conf_t *conf = sh->raid_conf;
1967 int raid_disks = sh->disks;
1968 int data_disks = raid_disks - conf->max_degraded;
1969 sector_t new_sector = sh->sector, check;
1970 int sectors_per_chunk = previous ? conf->prev_chunk_sectors
1971 : conf->chunk_sectors;
1972 int algorithm = previous ? conf->prev_algo
1976 sector_t chunk_number;
1977 int dummy1, dd_idx = i;
1979 struct stripe_head sh2;
1982 chunk_offset = sector_div(new_sector, sectors_per_chunk);
1983 stripe = new_sector;
1985 if (i == sh->pd_idx)
1987 switch(conf->level) {
1990 switch (algorithm) {
1991 case ALGORITHM_LEFT_ASYMMETRIC:
1992 case ALGORITHM_RIGHT_ASYMMETRIC:
1996 case ALGORITHM_LEFT_SYMMETRIC:
1997 case ALGORITHM_RIGHT_SYMMETRIC:
2000 i -= (sh->pd_idx + 1);
2002 case ALGORITHM_PARITY_0:
2005 case ALGORITHM_PARITY_N:
2012 if (i == sh->qd_idx)
2013 return 0; /* It is the Q disk */
2014 switch (algorithm) {
2015 case ALGORITHM_LEFT_ASYMMETRIC:
2016 case ALGORITHM_RIGHT_ASYMMETRIC:
2017 case ALGORITHM_ROTATING_ZERO_RESTART:
2018 case ALGORITHM_ROTATING_N_RESTART:
2019 if (sh->pd_idx == raid_disks-1)
2020 i--; /* Q D D D P */
2021 else if (i > sh->pd_idx)
2022 i -= 2; /* D D P Q D */
2024 case ALGORITHM_LEFT_SYMMETRIC:
2025 case ALGORITHM_RIGHT_SYMMETRIC:
2026 if (sh->pd_idx == raid_disks-1)
2027 i--; /* Q D D D P */
2032 i -= (sh->pd_idx + 2);
2035 case ALGORITHM_PARITY_0:
2038 case ALGORITHM_PARITY_N:
2040 case ALGORITHM_ROTATING_N_CONTINUE:
2041 /* Like left_symmetric, but P is before Q */
2042 if (sh->pd_idx == 0)
2043 i--; /* P D D D Q */
2048 i -= (sh->pd_idx + 1);
2051 case ALGORITHM_LEFT_ASYMMETRIC_6:
2052 case ALGORITHM_RIGHT_ASYMMETRIC_6:
2056 case ALGORITHM_LEFT_SYMMETRIC_6:
2057 case ALGORITHM_RIGHT_SYMMETRIC_6:
2059 i += data_disks + 1;
2060 i -= (sh->pd_idx + 1);
2062 case ALGORITHM_PARITY_0_6:
2071 chunk_number = stripe * data_disks + i;
2072 r_sector = chunk_number * sectors_per_chunk + chunk_offset;
2074 check = raid5_compute_sector(conf, r_sector,
2075 previous, &dummy1, &sh2);
2076 if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx
2077 || sh2.qd_idx != sh->qd_idx) {
2078 printk(KERN_ERR "md/raid:%s: compute_blocknr: map not correct\n",
2079 mdname(conf->mddev));
/*
 * schedule_reconstruction - set up stripe state for a parity update.
 *
 * Two paths are visible.  On the first, sh->reconstruct_state becomes
 * reconstruct_state_drain_run (with STRIPE_OP_BIODRAIN requested) or
 * reconstruct_state_run, STRIPE_OP_RECONSTRUCT is requested, devices are
 * marked R5_LOCKED and R5_Wantdrain with R5_UPTODATE cleared, and the stripe
 * is counted in conf->pending_full_writes when s->locked plus max_degraded
 * equals the number of disks.  On the second, the parity block must be
 * R5_UPTODATE or R5_Wantcompute (BUG_ON otherwise), the state becomes
 * reconstruct_state_prexor_drain_run and PREXOR, BIODRAIN and RECONSTRUCT
 * are all requested.
 *
 * Afterwards the parity device is locked and marked not up to date, and the
 * same is done for the device at sh->qd_idx.
 *
 * NOTE(review): the return type, the tests on @rcw, @expand and the raid
 * level, and the per-device conditions inside the loops are on lines
 * missing from this listing; presumably the first path is the
 * reconstruct-write case and the second the read-modify-write case -
 * confirm against the full source.
 */
2087 schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
2088 int rcw, int expand)
2090 int i, pd_idx = sh->pd_idx, disks = sh->disks;
2091 raid5_conf_t *conf = sh->raid_conf;
2092 int level = conf->level;
2095 /* if we are not expanding this is a proper write request, and
2096 * there will be bios with new data to be drained into the
2100 sh->reconstruct_state = reconstruct_state_drain_run;
2101 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
2103 sh->reconstruct_state = reconstruct_state_run;
2105 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
2107 for (i = disks; i--; ) {
2108 struct r5dev *dev = &sh->dev[i];
2111 set_bit(R5_LOCKED, &dev->flags);
2112 set_bit(R5_Wantdrain, &dev->flags);
2114 clear_bit(R5_UPTODATE, &dev->flags);
2118 if (s->locked + conf->max_degraded == disks)
2119 if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))
2120 atomic_inc(&conf->pending_full_writes);
2123 BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) ||
2124 test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags)));
2126 sh->reconstruct_state = reconstruct_state_prexor_drain_run;
2127 set_bit(STRIPE_OP_PREXOR, &s->ops_request);
2128 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
2129 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
2131 for (i = disks; i--; ) {
2132 struct r5dev *dev = &sh->dev[i];
2137 (test_bit(R5_UPTODATE, &dev->flags) ||
2138 test_bit(R5_Wantcompute, &dev->flags))) {
2139 set_bit(R5_Wantdrain, &dev->flags);
2140 set_bit(R5_LOCKED, &dev->flags);
2141 clear_bit(R5_UPTODATE, &dev->flags);
2147 /* keep the parity disk(s) locked while asynchronous operations
2150 set_bit(R5_LOCKED, &sh->dev[pd_idx].flags);
2151 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
2155 int qd_idx = sh->qd_idx;
2156 struct r5dev *dev = &sh->dev[qd_idx];
2158 set_bit(R5_LOCKED, &dev->flags);
2159 clear_bit(R5_UPTODATE, &dev->flags);
2163 pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n",
2164 __func__, (unsigned long long)sh->sector,
2165 s->locked, s->ops_request);
2169 * Each stripe/dev can have one or more bios attached.
2170 * toread/towrite point to the first in a chain.
2171 * The bi_next chain must be in order.
/*
 * add_stripe_bio - attach @bi to device @dd_idx of stripe @sh.
 *
 * Under conf->device_lock the bio is inserted into the device's towrite or
 * toread chain, which is kept ordered by bi_sector.  While walking the
 * chain, an existing bio that extends past the start of @bi, or a following
 * bio that starts before the end of @bi, is treated as an overlap.  On the
 * overlap path R5_Overlap is set on the device and the lock is released.
 *
 * After a successful insert bi->bi_phys_segments is incremented.  The
 * towrite chain is then scanned to see whether the queued writes cover the
 * whole STRIPE_SECTORS range of the device; if so R5_OVERWRITE is set.  When
 * the array has a bitmap and this was the first write, bitmap_startwrite()
 * is called, sh->bm_seq is set to conf->seq_flush+1 and STRIPE_BIT_DELAY is
 * set.
 *
 * NOTE(review): the test on @forwrite, the assignment of firstwrite, the
 * goto/label lines for the overlap path, the link-in of @bi and the return
 * values are on lines missing from this listing.
 */
2173 static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite)
2176 raid5_conf_t *conf = sh->raid_conf;
2179 pr_debug("adding bi b#%llu to stripe s#%llu\n",
2180 (unsigned long long)bi->bi_sector,
2181 (unsigned long long)sh->sector);
2184 spin_lock_irq(&conf->device_lock);
2186 bip = &sh->dev[dd_idx].towrite;
2187 if (*bip == NULL && sh->dev[dd_idx].written == NULL)
2190 bip = &sh->dev[dd_idx].toread;
2191 while (*bip && (*bip)->bi_sector < bi->bi_sector) {
2192 if ((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector)
2194 bip = & (*bip)->bi_next;
2196 if (*bip && (*bip)->bi_sector < bi->bi_sector + ((bi->bi_size)>>9))
2199 BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next);
2203 bi->bi_phys_segments++;
2206 /* check if page is covered */
2207 sector_t sector = sh->dev[dd_idx].sector;
2208 for (bi=sh->dev[dd_idx].towrite;
2209 sector < sh->dev[dd_idx].sector + STRIPE_SECTORS &&
2210 bi && bi->bi_sector <= sector;
2211 bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) {
2212 if (bi->bi_sector + (bi->bi_size>>9) >= sector)
2213 sector = bi->bi_sector + (bi->bi_size>>9);
2215 if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
2216 set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags);
2218 spin_unlock_irq(&conf->device_lock);
2220 pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n",
2221 (unsigned long long)(*bip)->bi_sector,
2222 (unsigned long long)sh->sector, dd_idx);
2224 if (conf->mddev->bitmap && firstwrite) {
2225 bitmap_startwrite(conf->mddev->bitmap, sh->sector,
2227 sh->bm_seq = conf->seq_flush+1;
2228 set_bit(STRIPE_BIT_DELAY, &sh->state);
2233 set_bit(R5_Overlap, &sh->dev[dd_idx].flags);
2234 spin_unlock_irq(&conf->device_lock);
2238 static void end_reshape(raid5_conf_t *conf);
/*
 * stripe_set_idx - call raid5_compute_sector() for the stripe at @stripe.
 *
 * @stripe is divided by the chunk size (current or previous layout,
 * selected by @previous) to obtain a stripe number and an offset within the
 * chunk.  These are converted to an array sector as
 * stripe * (disks - max_degraded) * sectors_per_chunk + chunk_offset, which
 * is passed to raid5_compute_sector().
 *
 * NOTE(review): the remaining arguments of that call are on lines missing
 * from this listing; presumably @sh is passed so its pd_idx and qd_idx are
 * filled in - confirm.
 */
2240 static void stripe_set_idx(sector_t stripe, raid5_conf_t *conf, int previous,
2241 struct stripe_head *sh)
2243 int sectors_per_chunk =
2244 previous ? conf->prev_chunk_sectors : conf->chunk_sectors;
2246 int chunk_offset = sector_div(stripe, sectors_per_chunk);
2247 int disks = previous ? conf->previous_raid_disks : conf->raid_disks;
2249 raid5_compute_sector(conf,
2250 stripe * (disks - conf->max_degraded)
2251 *sectors_per_chunk + chunk_offset,
/*
 * handle_failed_stripe - fail the bios queued on a stripe that cannot be
 * serviced.
 *
 * For every device of the stripe:
 *  - if R5_ReadError is set, the rdev is looked up (a pending reference is
 *    taken when it is In_sync), rdev_set_badblocks() is attempted and, if
 *    that fails, the device is reported through md_error(); the pending
 *    reference is then dropped;
 *  - under conf->device_lock the towrite chain and then the written chain
 *    are detached; each bio within the device's STRIPE_SECTORS range has
 *    BIO_UPTODATE cleared and, once its segment count reaches zero,
 *    md_write_end() is called and the bio is pushed onto *@return_bi;
 *  - if no biofill is pending (R5_Wantfill clear) and the device is not
 *    R5_Insync or has R5_ReadError, the toread chain is failed the same way
 *    (without md_write_end) and s->to_read is decremented;
 *  - waiters on wait_for_overlap are woken when R5_Overlap was set;
 *  - bitmap_endwrite() is called and R5_LOCKED is cleared.
 *
 * Finally, if STRIPE_FULL_WRITE was set it is cleared, and the md thread is
 * woken when conf->pending_full_writes drops to zero.
 *
 * NOTE(review): the return type, local declarations, the bitmap_end
 * handling around bitmap_endwrite() and the loop advance statements are on
 * lines missing from this listing.
 */
2257 handle_failed_stripe(raid5_conf_t *conf, struct stripe_head *sh,
2258 struct stripe_head_state *s, int disks,
2259 struct bio **return_bi)
2262 for (i = disks; i--; ) {
2266 if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
2269 rdev = rcu_dereference(conf->disks[i].rdev);
2270 if (rdev && test_bit(In_sync, &rdev->flags))
2271 atomic_inc(&rdev->nr_pending);
2276 if (!rdev_set_badblocks(
2280 md_error(conf->mddev, rdev);
2281 rdev_dec_pending(rdev, conf->mddev);
2284 spin_lock_irq(&conf->device_lock);
2285 /* fail all writes first */
2286 bi = sh->dev[i].towrite;
2287 sh->dev[i].towrite = NULL;
2293 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
2294 wake_up(&conf->wait_for_overlap);
2296 while (bi && bi->bi_sector <
2297 sh->dev[i].sector + STRIPE_SECTORS) {
2298 struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
2299 clear_bit(BIO_UPTODATE, &bi->bi_flags);
2300 if (!raid5_dec_bi_phys_segments(bi)) {
2301 md_write_end(conf->mddev);
2302 bi->bi_next = *return_bi;
2307 /* and fail all 'written' */
2308 bi = sh->dev[i].written;
2309 sh->dev[i].written = NULL;
2310 if (bi) bitmap_end = 1;
2311 while (bi && bi->bi_sector <
2312 sh->dev[i].sector + STRIPE_SECTORS) {
2313 struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
2314 clear_bit(BIO_UPTODATE, &bi->bi_flags);
2315 if (!raid5_dec_bi_phys_segments(bi)) {
2316 md_write_end(conf->mddev);
2317 bi->bi_next = *return_bi;
2323 /* fail any reads if this device is non-operational and
2324 * the data has not reached the cache yet.
2326 if (!test_bit(R5_Wantfill, &sh->dev[i].flags) &&
2327 (!test_bit(R5_Insync, &sh->dev[i].flags) ||
2328 test_bit(R5_ReadError, &sh->dev[i].flags))) {
2329 bi = sh->dev[i].toread;
2330 sh->dev[i].toread = NULL;
2331 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
2332 wake_up(&conf->wait_for_overlap);
2333 if (bi) s->to_read--;
2334 while (bi && bi->bi_sector <
2335 sh->dev[i].sector + STRIPE_SECTORS) {
2336 struct bio *nextbi =
2337 r5_next_bio(bi, sh->dev[i].sector);
2338 clear_bit(BIO_UPTODATE, &bi->bi_flags);
2339 if (!raid5_dec_bi_phys_segments(bi)) {
2340 bi->bi_next = *return_bi;
2346 spin_unlock_irq(&conf->device_lock);
2348 bitmap_endwrite(conf->mddev->bitmap, sh->sector,
2349 STRIPE_SECTORS, 0, 0);
2350 /* If we were in the middle of a write the parity block might
2351 * still be locked - so just clear all R5_LOCKED flags
2353 clear_bit(R5_LOCKED, &sh->dev[i].flags);
2356 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
2357 if (atomic_dec_and_test(&conf->pending_full_writes))
2358 md_wakeup_thread(conf->mddev->thread);
/*
 * handle_failed_sync - finish a sync pass on a stripe that cannot be synced.
 *
 * Reports STRIPE_SECTORS as done with a zero "ok" argument through
 * md_done_sync() and clears STRIPE_SYNCING.  When the array is in recovery
 * (MD_RECOVERY_RECOVER), each member device that is present, not Faulty and
 * not In_sync gets a bad-block record for this stripe through
 * rdev_set_badblocks().  conf->recovery_disabled is set from the mddev value
 * and MD_RECOVERY_INTR is set on visible lines.
 *
 * NOTE(review): the return type, the early return for the non-recovery
 * case, the remaining rdev_set_badblocks() arguments and the condition under
 * which recovery is aborted are on lines missing from this listing.
 */
2362 handle_failed_sync(raid5_conf_t *conf, struct stripe_head *sh,
2363 struct stripe_head_state *s)
2368 md_done_sync(conf->mddev, STRIPE_SECTORS, 0);
2369 clear_bit(STRIPE_SYNCING, &sh->state);
2371 /* There is nothing more to do for sync/check/repair.
2372 * For recover we need to record a bad block on all
2373 * non-sync devices, or abort the recovery
2375 if (!test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery))
2377 /* During recovery devices cannot be removed, so locking and
2378 * refcounting of rdevs is not needed
2380 for (i = 0; i < conf->raid_disks; i++) {
2381 mdk_rdev_t *rdev = conf->disks[i].rdev;
2383 || test_bit(Faulty, &rdev->flags)
2384 || test_bit(In_sync, &rdev->flags))
2386 if (!rdev_set_badblocks(rdev, sh->sector,
2391 conf->recovery_disabled = conf->mddev->recovery_disabled;
2392 set_bit(MD_RECOVERY_INTR, &conf->mddev->recovery);
2396 /* fetch_block - checks the given member device to see if its data needs
2397 * to be read or computed to satisfy a request.
2399 * Returns 1 when no more member devices need to be checked, otherwise returns
2400 * 0 to tell the loop in handle_stripe_fill to continue
/*
 * Decide how the block on device @disk_idx is to be obtained.
 *
 * A block is considered only when it is neither R5_LOCKED nor R5_UPTODATE
 * and one of the listed reasons applies (a partial write to it, syncing or
 * expanding, a read or partial write aimed at a failed device, or a RAID6
 * write while degraded).  Three outcomes are visible:
 *  - the block is on a failed device and all other blocks are up to date:
 *    schedule a single-target compute (ops.target = disk_idx, target2 = -1);
 *  - two devices have failed and exactly two blocks are missing: find the
 *    other block that is not up to date and schedule a two-target compute;
 *  - the device is R5_Insync: lock the block and request a read.
 *
 * NOTE(review): the first clause of the "needed" condition, the counter
 * updates and the return statements are on lines missing from this listing.
 */
2402 static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s,
2403 int disk_idx, int disks)
2405 struct r5dev *dev = &sh->dev[disk_idx];
2406 struct r5dev *fdev[2] = { &sh->dev[s->failed_num[0]],
2407 &sh->dev[s->failed_num[1]] };
2409 /* is the data in this block needed, and can we get it? */
2410 if (!test_bit(R5_LOCKED, &dev->flags) &&
2411 !test_bit(R5_UPTODATE, &dev->flags) &&
2413 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
2414 s->syncing || s->expanding ||
2415 (s->failed >= 1 && fdev[0]->toread) ||
2416 (s->failed >= 2 && fdev[1]->toread) ||
2417 (sh->raid_conf->level <= 5 && s->failed && fdev[0]->towrite &&
2418 !test_bit(R5_OVERWRITE, &fdev[0]->flags)) ||
2419 (sh->raid_conf->level == 6 && s->failed && s->to_write))) {
2420 /* we would like to get this block, possibly by computing it,
2421 * otherwise read it if the backing disk is insync
2423 BUG_ON(test_bit(R5_Wantcompute, &dev->flags));
2424 BUG_ON(test_bit(R5_Wantread, &dev->flags));
2425 if ((s->uptodate == disks - 1) &&
2426 (s->failed && (disk_idx == s->failed_num[0] ||
2427 disk_idx == s->failed_num[1]))) {
2428 /* have disk failed, and we're requested to fetch it;
2431 pr_debug("Computing stripe %llu block %d\n",
2432 (unsigned long long)sh->sector, disk_idx);
2433 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
2434 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
2435 set_bit(R5_Wantcompute, &dev->flags);
2436 sh->ops.target = disk_idx;
2437 sh->ops.target2 = -1; /* no 2nd target */
2439 /* Careful: from this point on 'uptodate' is in the eye
2440 * of raid_run_ops which services 'compute' operations
2441 * before writes. R5_Wantcompute flags a block that will
2442 * be R5_UPTODATE by the time it is needed for a
2443 * subsequent operation.
2447 } else if (s->uptodate == disks-2 && s->failed >= 2) {
2448 /* Computing 2-failure is *very* expensive; only
2449 * do it if failed >= 2
2452 for (other = disks; other--; ) {
2453 if (other == disk_idx)
2455 if (!test_bit(R5_UPTODATE,
2456 &sh->dev[other].flags))
2460 pr_debug("Computing stripe %llu blocks %d,%d\n",
2461 (unsigned long long)sh->sector,
2463 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
2464 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
2465 set_bit(R5_Wantcompute, &sh->dev[disk_idx].flags);
2466 set_bit(R5_Wantcompute, &sh->dev[other].flags);
2467 sh->ops.target = disk_idx;
2468 sh->ops.target2 = other;
2472 } else if (test_bit(R5_Insync, &dev->flags)) {
2473 set_bit(R5_LOCKED, &dev->flags);
2474 set_bit(R5_Wantread, &dev->flags);
2476 pr_debug("Reading block %d (sync=%d)\n",
2477 disk_idx, s->syncing);
2485 * handle_stripe_fill - read or compute data to satisfy pending requests.
/*
 * Walks the devices of @sh from the highest index down and calls
 * fetch_block() on each, but only when no compute is running
 * (STRIPE_COMPUTE_RUN clear) and the stripe has neither a check_state nor a
 * reconstruct_state.  STRIPE_HANDLE is set on the stripe on a visible line.
 *
 * NOTE(review): the remaining parameters, the statement executed when
 * fetch_block() returns non-zero (presumably a break) and the exact scope
 * of the set_bit() call are on lines missing from this listing.
 */
2487 static void handle_stripe_fill(struct stripe_head *sh,
2488 struct stripe_head_state *s,
2493 /* look for blocks to read/compute, skip this if a compute
2494 * is already in flight, or if the stripe contents are in the
2495 * midst of changing due to a write
2497 if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state &&
2498 !sh->reconstruct_state)
2499 for (i = disks; i--; )
2500 if (fetch_block(sh, s, i, disks))
2502 set_bit(STRIPE_HANDLE, &sh->state);
2506 /* handle_stripe_clean_event
2507 * any written block on an uptodate or failed drive can be returned.
2508 * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but
2509 * never LOCKED, so we don't need to test 'failed' directly.
/*
 * Complete the write bios of every device whose data has reached a stable
 * state.
 *
 * For each device with a non-NULL written chain that is not R5_LOCKED and
 * is R5_UPTODATE, the chain is detached under conf->device_lock.  Each bio
 * within the device's STRIPE_SECTORS range has its segment count
 * decremented; when it reaches zero md_write_end() is called and the bio is
 * pushed onto *@return_bi.  bitmap_endwrite() is then called, passing
 * whether the stripe is not STRIPE_DEGRADED.
 *
 * Finally, if STRIPE_FULL_WRITE was set it is cleared, and the md thread is
 * woken when conf->pending_full_writes drops to zero.
 *
 * NOTE(review): the declaration of dev, the initial load of wbi, the
 * statement under the "towrite == NULL" test and the condition guarding
 * bitmap_endwrite() are on lines missing from this listing.
 */
2511 static void handle_stripe_clean_event(raid5_conf_t *conf,
2512 struct stripe_head *sh, int disks, struct bio **return_bi)
2517 for (i = disks; i--; )
2518 if (sh->dev[i].written) {
2520 if (!test_bit(R5_LOCKED, &dev->flags) &&
2521 test_bit(R5_UPTODATE, &dev->flags)) {
2522 /* We can return any write requests */
2523 struct bio *wbi, *wbi2;
2525 pr_debug("Return write for disc %d\n", i);
2526 spin_lock_irq(&conf->device_lock);
2528 dev->written = NULL;
2529 while (wbi && wbi->bi_sector <
2530 dev->sector + STRIPE_SECTORS) {
2531 wbi2 = r5_next_bio(wbi, dev->sector);
2532 if (!raid5_dec_bi_phys_segments(wbi)) {
2533 md_write_end(conf->mddev);
2534 wbi->bi_next = *return_bi;
2539 if (dev->towrite == NULL)
2541 spin_unlock_irq(&conf->device_lock);
2543 bitmap_endwrite(conf->mddev->bitmap,
2546 !test_bit(STRIPE_DEGRADED, &sh->state),
2551 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
2552 if (atomic_dec_and_test(&conf->pending_full_writes))
2553 md_wakeup_thread(conf->mddev->thread);
/*
 * handle_stripe_dirtying - decide how a stripe with pending writes gets its
 * parity updated, and schedule any reads needed first.
 *
 * @conf: array configuration; max_degraded == 2 selects the RAID6 path.
 * @sh:   stripe being handled.
 * @s:    per-call stripe state; req_compute and locked are read here.
 *
 * Two costs are counted over all devices (skipped when max_degraded == 2):
 *   rmw - blocks that would have to be read for a read-modify-write: devices
 *         with a pending write, plus the parity device, that are neither
 *         locked, up to date nor being computed.  A device that is not
 *         R5_Insync adds 2 * disks, making this strategy unattractive.
 *   rcw - blocks that would have to be read for a reconstruct-write: data
 *         devices that are not fully overwritten and are neither locked,
 *         up to date nor being computed.
 *
 * STRIPE_HANDLE is set, then:
 *   - if rmw < rcw and rmw > 0, the read-modify-write candidates that are
 *     R5_Insync are marked R5_LOCKED | R5_Wantread; the test involving
 *     STRIPE_PREREAD_ACTIVE gates this, and STRIPE_DELAYED / STRIPE_HANDLE
 *     are set on the alternative path (NOTE(review): the else lines are not
 *     visible in this excerpt - confirm).
 *   - if rcw <= rmw and rcw > 0, the same is done for the reconstruct-write
 *     candidates, excluding both pd_idx and qd_idx and skipping devices
 *     that are not R5_Insync.
 *
 * Finally schedule_reconstruction(sh, s, rcw == 0, 0) is called when
 * nothing is locked (s->locked == 0), one of the two counts is zero,
 * STRIPE_BIT_DELAY is clear, and either s->req_compute is set or
 * STRIPE_COMPUTE_RUN is clear.
 */
2556 static void handle_stripe_dirtying(raid5_conf_t *conf,
2557 struct stripe_head *sh,
2558 struct stripe_head_state *s,
2561 int rmw = 0, rcw = 0, i;
2562 if (conf->max_degraded == 2) {
2563 /* RAID6 requires 'rcw' in current implementation
2564 * Calculate the real rcw later - for now fake it
2565 * look like rcw is cheaper
2568 } else for (i = disks; i--; ) {
2569 /* would I have to read this buffer for read_modify_write */
2570 struct r5dev *dev = &sh->dev[i];
2571 if ((dev->towrite || i == sh->pd_idx) &&
2572 !test_bit(R5_LOCKED, &dev->flags) &&
2573 !(test_bit(R5_UPTODATE, &dev->flags) ||
2574 test_bit(R5_Wantcompute, &dev->flags))) {
2575 if (test_bit(R5_Insync, &dev->flags))
2578 rmw += 2*disks; /* cannot read it */
2580 /* Would I have to read this buffer for reconstruct_write */
2581 if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx &&
2582 !test_bit(R5_LOCKED, &dev->flags) &&
2583 !(test_bit(R5_UPTODATE, &dev->flags) ||
2584 test_bit(R5_Wantcompute, &dev->flags))) {
2585 if (test_bit(R5_Insync, &dev->flags)) rcw++;
2590 pr_debug("for sector %llu, rmw=%d rcw=%d\n",
2591 (unsigned long long)sh->sector, rmw, rcw);
2592 set_bit(STRIPE_HANDLE, &sh->state);
2593 if (rmw < rcw && rmw > 0)
2594 /* prefer read-modify-write, but need to get some data */
2595 for (i = disks; i--; ) {
2596 struct r5dev *dev = &sh->dev[i];
2597 if ((dev->towrite || i == sh->pd_idx) &&
2598 !test_bit(R5_LOCKED, &dev->flags) &&
2599 !(test_bit(R5_UPTODATE, &dev->flags) ||
2600 test_bit(R5_Wantcompute, &dev->flags)) &&
2601 test_bit(R5_Insync, &dev->flags)) {
2603 test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
2604 pr_debug("Read_old block "
2605 "%d for r-m-w\n", i);
2606 set_bit(R5_LOCKED, &dev->flags);
2607 set_bit(R5_Wantread, &dev->flags);
2610 set_bit(STRIPE_DELAYED, &sh->state);
2611 set_bit(STRIPE_HANDLE, &sh->state);
2615 if (rcw <= rmw && rcw > 0) {
2616 /* want reconstruct write, but need to get some data */
2618 for (i = disks; i--; ) {
2619 struct r5dev *dev = &sh->dev[i];
2620 if (!test_bit(R5_OVERWRITE, &dev->flags) &&
2621 i != sh->pd_idx && i != sh->qd_idx &&
2622 !test_bit(R5_LOCKED, &dev->flags) &&
2623 !(test_bit(R5_UPTODATE, &dev->flags) ||
2624 test_bit(R5_Wantcompute, &dev->flags))) {
2626 if (!test_bit(R5_Insync, &dev->flags))
2627 continue; /* it's a failed drive */
2629 test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
2630 pr_debug("Read_old block "
2631 "%d for Reconstruct\n", i);
2632 set_bit(R5_LOCKED, &dev->flags);
2633 set_bit(R5_Wantread, &dev->flags);
2636 set_bit(STRIPE_DELAYED, &sh->state);
2637 set_bit(STRIPE_HANDLE, &sh->state);
2642 /* now if nothing is locked, and if we have enough data,
2643 * we can start a write request
2645 /* since handle_stripe can be called at any time we need to handle the
2646 * case where a compute block operation has been submitted and then a
2647 * subsequent call wants to start a write request. raid_run_ops only
2648 * handles the case where compute block and reconstruct are requested
2649 * simultaneously. If this is not the case then new writes need to be
2650 * held off until the compute completes.
2652 if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) &&
2653 (s->locked == 0 && (rcw == 0 || rmw == 0) &&
2654 !test_bit(STRIPE_BIT_DELAY, &sh->state)))
2655 schedule_reconstruction(sh, s, rcw == 0, 0);
/*
 * handle_parity_checks5 - advance the parity check / repair state machine
 * of a single-parity stripe.
 *
 * @conf:  array configuration; mddev->resync_mismatches and
 *         mddev->recovery are accessed here.
 * @sh:    stripe being checked; sh->check_state selects the step.
 * @s:     per-call stripe state; failed, failed_num[0] and uptodate are
 *         read, and operation requests are added to s->ops_request.
 * @disks: number of devices, used for the "everything is up to date"
 *         assertions.
 *
 * STRIPE_HANDLE is always set.  Steps by sh->check_state:
 *   check_state_idle           - with no failed device, move to
 *                                check_state_run, request STRIPE_OP_CHECK
 *                                and clear R5_UPTODATE on the parity block;
 *                                the code that follows selects the device
 *                                at s->failed_num[0].
 *   check_state_compute_result - go back to idle, select the parity device;
 *                                after the STRIPE_INSYNC test, the selected
 *                                device is asserted up to date, marked
 *                                R5_LOCKED | R5_Wantwrite, STRIPE_DEGRADED
 *                                is cleared and STRIPE_INSYNC set.
 *   check_state_run            - nothing to do until the check completes.
 *   check_state_check_result   - go back to idle; if the P result bit of
 *                                sh->ops.zero_sum_result is clear the
 *                                stripe is marked STRIPE_INSYNC.  Otherwise
 *                                resync_mismatches grows by STRIPE_SECTORS
 *                                and, when MD_RECOVERY_CHECK is set, the
 *                                stripe is marked in sync without repair;
 *                                else a compute of the parity block is
 *                                requested (check_state_compute_run,
 *                                STRIPE_COMPUTE_RUN, STRIPE_OP_COMPUTE_BLK,
 *                                R5_Wantcompute on pd_idx).
 *   anything else              - an error naming the state and sector is
 *                                printed.
 *
 * NOTE(review): the break / else lines separating these steps are not
 * visible in this excerpt; the grouping above follows the case labels.
 */
2658 static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh,
2659 struct stripe_head_state *s, int disks)
2661 struct r5dev *dev = NULL;
2663 set_bit(STRIPE_HANDLE, &sh->state);
2665 switch (sh->check_state) {
2666 case check_state_idle:
2667 /* start a new check operation if there are no failures */
2668 if (s->failed == 0) {
2669 BUG_ON(s->uptodate != disks);
2670 sh->check_state = check_state_run;
2671 set_bit(STRIPE_OP_CHECK, &s->ops_request);
2672 clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags);
2676 dev = &sh->dev[s->failed_num[0]];
2678 case check_state_compute_result:
2679 sh->check_state = check_state_idle;
2681 dev = &sh->dev[sh->pd_idx];
2683 /* check that a write has not made the stripe insync */
2684 if (test_bit(STRIPE_INSYNC, &sh->state))
2687 /* either failed parity check, or recovery is happening */
2688 BUG_ON(!test_bit(R5_UPTODATE, &dev->flags));
2689 BUG_ON(s->uptodate != disks);
2691 set_bit(R5_LOCKED, &dev->flags);
2693 set_bit(R5_Wantwrite, &dev->flags);
2695 clear_bit(STRIPE_DEGRADED, &sh->state);
2696 set_bit(STRIPE_INSYNC, &sh->state);
2698 case check_state_run:
2699 break; /* we will be called again upon completion */
2700 case check_state_check_result:
2701 sh->check_state = check_state_idle;
2703 /* if a failure occurred during the check operation, leave
2704 * STRIPE_INSYNC not set and let the stripe be handled again
2709 /* handle a successful check operation, if parity is correct
2710 * we are done. Otherwise update the mismatch count and repair
2711 * parity if !MD_RECOVERY_CHECK
2713 if ((sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) == 0)
2714 /* parity is correct (on disc,
2715 * not in buffer any more)
2717 set_bit(STRIPE_INSYNC, &sh->state);
2719 conf->mddev->resync_mismatches += STRIPE_SECTORS;
2720 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
2721 /* don't try to repair!! */
2722 set_bit(STRIPE_INSYNC, &sh->state);
2724 sh->check_state = check_state_compute_run;
2725 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
2726 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
2727 set_bit(R5_Wantcompute,
2728 &sh->dev[sh->pd_idx].flags);
2729 sh->ops.target = sh->pd_idx;
2730 sh->ops.target2 = -1;
2735 case check_state_compute_run:
2738 printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n",
2739 __func__, sh->check_state,
2740 (unsigned long long) sh->sector);
/*
 * handle_parity_checks6 - advance the P/Q check / repair state machine of a
 * dual-parity stripe.
 *
 * @conf: array configuration; mddev->resync_mismatches and mddev->recovery
 *        are accessed here.
 * @sh:   stripe being checked; sh->check_state selects the step and
 *        sh->ops.zero_sum_result carries the P / Q result bits.
 * @s:    per-call stripe state; failed, q_failed, failed_num[] and uptodate
 *        are read, and operation requests are added to s->ops_request.
 *
 * STRIPE_HANDLE is always set and more than two failures is a BUG.
 * Steps by sh->check_state:
 *   check_state_idle           - pick what can be checked: P when
 *                                s->failed == s->q_failed
 *                                (check_state_run), and Q when Q has not
 *                                failed and fewer than two devices failed
 *                                (upgrading to check_state_run_pq, or
 *                                selecting check_state_run_q).  The stale
 *                                zero_sum_result is discarded, R5_UPTODATE
 *                                is cleared on P for a P-only check, and
 *                                STRIPE_OP_CHECK is requested when one of
 *                                the run states was chosen.  Otherwise the
 *                                stripe must have exactly two failures.
 *   check_state_compute_result - go back to idle; after the STRIPE_INSYNC
 *                                test, the failed devices and whichever of
 *                                P / Q has its result bit set are marked
 *                                R5_LOCKED | R5_Wantwrite, STRIPE_DEGRADED
 *                                is cleared and STRIPE_INSYNC set.
 *   check_state_run, _run_q,
 *   _run_pq                    - nothing to do until the check completes.
 *   check_state_check_result   - go back to idle.  A zero zero_sum_result
 *                                means both parities are correct; the
 *                                stripe is marked STRIPE_INSYNC or handed
 *                                to check_state_compute_result
 *                                (NOTE(review): the condition choosing
 *                                between the two is not visible here).
 *                                Otherwise resync_mismatches grows by
 *                                STRIPE_SECTORS and, unless
 *                                MD_RECOVERY_CHECK forbids repair, a
 *                                compute is requested with R5_Wantcompute
 *                                on P and/or Q according to the result
 *                                bits.
 *   anything else              - an error naming the state and sector is
 *                                printed.
 */
2746 static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh,
2747 struct stripe_head_state *s,
2750 int pd_idx = sh->pd_idx;
2751 int qd_idx = sh->qd_idx;
2754 set_bit(STRIPE_HANDLE, &sh->state);
2756 BUG_ON(s->failed > 2);
2758 /* Want to check and possibly repair P and Q.
2759 * However there could be one 'failed' device, in which
2760 * case we can only check one of them, possibly using the
2761 * other to generate missing data
2764 switch (sh->check_state) {
2765 case check_state_idle:
2766 /* start a new check operation if there are < 2 failures */
2767 if (s->failed == s->q_failed) {
2768 /* The only possible failed device holds Q, so it
2769 * makes sense to check P (If anything else were failed,
2770 * we would have used P to recreate it).
2772 sh->check_state = check_state_run;
2774 if (!s->q_failed && s->failed < 2) {
2775 /* Q is not failed, and we didn't use it to generate
2776 * anything, so it makes sense to check it
2778 if (sh->check_state == check_state_run)
2779 sh->check_state = check_state_run_pq;
2781 sh->check_state = check_state_run_q;
2784 /* discard potentially stale zero_sum_result */
2785 sh->ops.zero_sum_result = 0;
2787 if (sh->check_state == check_state_run) {
2788 /* async_xor_zero_sum destroys the contents of P */
2789 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
2792 if (sh->check_state >= check_state_run &&
2793 sh->check_state <= check_state_run_pq) {
2794 /* async_syndrome_zero_sum preserves P and Q, so
2795 * no need to mark them !uptodate here
2797 set_bit(STRIPE_OP_CHECK, &s->ops_request);
2801 /* we have 2-disk failure */
2802 BUG_ON(s->failed != 2);
2804 case check_state_compute_result:
2805 sh->check_state = check_state_idle;
2807 /* check that a write has not made the stripe insync */
2808 if (test_bit(STRIPE_INSYNC, &sh->state))
2811 /* now write out any block on a failed drive,
2812 * or P or Q if they were recomputed
2814 BUG_ON(s->uptodate < disks - 1); /* We don't need Q to recover */
2815 if (s->failed == 2) {
2816 dev = &sh->dev[s->failed_num[1]];
2818 set_bit(R5_LOCKED, &dev->flags);
2819 set_bit(R5_Wantwrite, &dev->flags);
2821 if (s->failed >= 1) {
2822 dev = &sh->dev[s->failed_num[0]];
2824 set_bit(R5_LOCKED, &dev->flags);
2825 set_bit(R5_Wantwrite, &dev->flags);
2827 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) {
2828 dev = &sh->dev[pd_idx];
2830 set_bit(R5_LOCKED, &dev->flags);
2831 set_bit(R5_Wantwrite, &dev->flags);
2833 if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) {
2834 dev = &sh->dev[qd_idx];
2836 set_bit(R5_LOCKED, &dev->flags);
2837 set_bit(R5_Wantwrite, &dev->flags);
2839 clear_bit(STRIPE_DEGRADED, &sh->state);
2841 set_bit(STRIPE_INSYNC, &sh->state);
2843 case check_state_run:
2844 case check_state_run_q:
2845 case check_state_run_pq:
2846 break; /* we will be called again upon completion */
2847 case check_state_check_result:
2848 sh->check_state = check_state_idle;
2850 /* handle a successful check operation, if parity is correct
2851 * we are done. Otherwise update the mismatch count and repair
2852 * parity if !MD_RECOVERY_CHECK
2854 if (sh->ops.zero_sum_result == 0) {
2855 /* both parities are correct */
2857 set_bit(STRIPE_INSYNC, &sh->state);
2859 /* in contrast to the raid5 case we can validate
2860 * parity, but still have a failure to write
2863 sh->check_state = check_state_compute_result;
2864 /* Returning at this point means that we may go
2865 * off and bring p and/or q uptodate again so
2866 * we make sure to check zero_sum_result again
2867 * to verify if p or q need writeback
2871 conf->mddev->resync_mismatches += STRIPE_SECTORS;
2872 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
2873 /* don't try to repair!! */
2874 set_bit(STRIPE_INSYNC, &sh->state);
2876 int *target = &sh->ops.target;
2878 sh->ops.target = -1;
2879 sh->ops.target2 = -1;
2880 sh->check_state = check_state_compute_run;
2881 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
2882 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
2883 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) {
2884 set_bit(R5_Wantcompute,
2885 &sh->dev[pd_idx].flags);
2887 target = &sh->ops.target2;
2890 if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) {
2891 set_bit(R5_Wantcompute,
2892 &sh->dev[qd_idx].flags);
2899 case check_state_compute_run:
2902 printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n",
2903 __func__, sh->check_state,
2904 (unsigned long long) sh->sector);
/*
 * handle_stripe_expansion - copy the data blocks of an expansion source
 * stripe into the stripes that will hold them afterwards.
 *
 * @conf: array configuration; used for sector mapping, stripe lookup and
 *        conf->raid_disks.
 * @sh:   source stripe; its STRIPE_EXPAND_SOURCE bit is cleared here.
 *
 * For every device of sh other than pd_idx and qd_idx:
 *   - the block number from compute_blocknr() is mapped with
 *     raid5_compute_sector() and the destination stripe is looked up with
 *     get_active_stripe();
 *   - a destination that is not STRIPE_EXPANDING, or whose target block is
 *     already R5_Expanded, is released without copying;
 *   - otherwise the page is copied with async_memcpy(), each copy chained
 *     to the previous descriptor (tx) so they share one channel, and the
 *     target block is flagged R5_Expanded | R5_UPTODATE;
 *   - the destination's devices are scanned for R5_Expanded, and when the
 *     scan index reaches conf->raid_disks the destination is flagged
 *     STRIPE_EXPAND_READY | STRIPE_HANDLE; the destination is then
 *     released.
 *
 * dma_wait_for_async_tx(tx) is called once the copies have been submitted.
 *
 * NOTE(review): the handling of a failed get_active_stripe() lookup and
 * the declarations of i, j and dd_idx are not visible in this excerpt.
 */
2909 static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh)
2913 /* We have read all the blocks in this stripe and now we need to
2914 * copy some of them into a target stripe for expand.
2916 struct dma_async_tx_descriptor *tx = NULL;
2917 clear_bit(STRIPE_EXPAND_SOURCE, &sh->state);
2918 for (i = 0; i < sh->disks; i++)
2919 if (i != sh->pd_idx && i != sh->qd_idx) {
2921 struct stripe_head *sh2;
2922 struct async_submit_ctl submit;
2924 sector_t bn = compute_blocknr(sh, i, 1);
2925 sector_t s = raid5_compute_sector(conf, bn, 0,
2927 sh2 = get_active_stripe(conf, s, 0, 1, 1);
2929 /* so far only the early blocks of this stripe
2930 * have been requested. When later blocks
2931 * get requested, we will try again
2934 if (!test_bit(STRIPE_EXPANDING, &sh2->state) ||
2935 test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) {
2936 /* must have already done this block */
2937 release_stripe(sh2);
2941 /* place all the copies on one channel */
2942 init_async_submit(&submit, 0, tx, NULL, NULL, NULL);
2943 tx = async_memcpy(sh2->dev[dd_idx].page,
2944 sh->dev[i].page, 0, 0, STRIPE_SIZE,
2947 set_bit(R5_Expanded, &sh2->dev[dd_idx].flags);
2948 set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags);
2949 for (j = 0; j < conf->raid_disks; j++)
2950 if (j != sh2->pd_idx &&
2952 !test_bit(R5_Expanded, &sh2->dev[j].flags))
2954 if (j == conf->raid_disks) {
2955 set_bit(STRIPE_EXPAND_READY, &sh2->state);
2956 set_bit(STRIPE_HANDLE, &sh2->state);
2958 release_stripe(sh2);
2961 /* done submitting copies, wait for them to complete */
2964 dma_wait_for_async_tx(tx);
2970 * handle_stripe - do things to a stripe.
2972 * We lock the stripe and then examine the state of various bits
2973 * to see what needs to be done.
2975 * return some read requests which now have data
2976 * return some write requests which are safely on disc
2977 * schedule a read on some buffers
2978 * schedule a write of some buffers
2979 * return confirmation of parity correctness
2981 * buffers are taken off read_list or write_list, and bh_cache buffers
2982 * get BH_Lock set before the stripe lock is released.
2986 static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
2988 raid5_conf_t *conf = sh->raid_conf;
2989 int disks = sh->disks;
2993 memset(s, 0, sizeof(*s));
2995 s->syncing = test_bit(STRIPE_SYNCING, &sh->state);
2996 s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
2997 s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
2998 s->failed_num[0] = -1;
2999 s->failed_num[1] = -1;
3001 /* Now to look around and see what can be done */
3003 spin_lock_irq(&conf->device_lock);
3004 for (i=disks; i--; ) {
3012 pr_debug("check %d: state 0x%lx read %p write %p written %p\n",
3013 i, dev->flags, dev->toread, dev->towrite, dev->written);
3014 /* maybe we can reply to a read
3016 * new wantfill requests are only permitted while
3017 * ops_complete_biofill is guaranteed to be inactive
3019 if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread &&
3020 !test_bit(STRIPE_BIOFILL_RUN, &sh->state))
3021 set_bit(R5_Wantfill, &dev->flags);
3023 /* now count some things */
3024 if (test_bit(R5_LOCKED, &dev->flags))
3026 if (test_bit(R5_UPTODATE, &dev->flags))
3028 if (test_bit(R5_Wantcompute, &dev->flags)) {
3030 BUG_ON(s->compute > 2);
3033 if (test_bit(R5_Wantfill, &dev->flags))
3035 else if (dev->toread)
3039 if (!test_bit(R5_OVERWRITE, &dev->flags))
3044 rdev = rcu_dereference(conf->disks[i].rdev);
3046 is_bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS,
3047 &first_bad, &bad_sectors);
3048 if (s->blocked_rdev == NULL
3049 && (test_bit(Blocked, &rdev->flags)
3052 set_bit(BlockedBadBlocks,
3054 s->blocked_rdev = rdev;
3055 atomic_inc(&rdev->nr_pending);
3058 clear_bit(R5_Insync, &dev->flags);
3062 /* also not in-sync */
3063 if (!test_bit(WriteErrorSeen, &rdev->flags)) {
3064 /* treat as in-sync, but with a read error
3065 * which we can now try to correct
3067 set_bit(R5_Insync, &dev->flags);
3068 set_bit(R5_ReadError, &dev->flags);
3070 } else if (test_bit(In_sync, &rdev->flags))
3071 set_bit(R5_Insync, &dev->flags);
3073 /* in sync if before recovery_offset */
3074 if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset)
3075 set_bit(R5_Insync, &dev->flags);
3077 if (test_bit(R5_WriteError, &dev->flags)) {
3078 clear_bit(R5_Insync, &dev->flags);
3079 if (!test_bit(Faulty, &rdev->flags)) {
3080 s->handle_bad_blocks = 1;
3081 atomic_inc(&rdev->nr_pending);
3083 clear_bit(R5_WriteError, &dev->flags);
3085 if (test_bit(R5_MadeGood, &dev->flags)) {