Merge git://git.kernel.org/pub/scm/linux/kernel/git/agk/linux-2.6-dm
authorLinus Torvalds <torvalds@linux-foundation.org>
Wed, 3 Aug 2011 06:49:21 +0000 (20:49 -1000)
committerLinus Torvalds <torvalds@linux-foundation.org>
Wed, 3 Aug 2011 06:49:21 +0000 (20:49 -1000)
* git://git.kernel.org/pub/scm/linux/kernel/git/agk/linux-2.6-dm: (34 commits)
  dm table: set flush capability based on underlying devices
  dm crypt: optionally support discard requests
  dm raid: add md raid1 support
  dm raid: support metadata devices
  dm raid: add write_mostly parameter
  dm raid: add region_size parameter
  dm raid: improve table parameters documentation
  dm ioctl: forbid multiple device specifiers
  dm ioctl: introduce __get_dev_cell
  dm ioctl: fill in device parameters in more ioctls
  dm flakey: add corrupt_bio_byte feature
  dm flakey: add drop_writes
  dm flakey: support feature args
  dm flakey: use dm_target_offset and support discards
  dm table: share target argument parsing functions
  dm snapshot: skip reading origin when overwriting complete chunk
  dm: ignore merge_bvec for snapshots when safe
  dm table: clean dm_get_device and move exports
  dm raid: tidy includes
  dm ioctl: prevent empty message
  ...

21 files changed:
Documentation/device-mapper/dm-crypt.txt
Documentation/device-mapper/dm-flakey.txt
Documentation/device-mapper/dm-raid.txt
drivers/md/Kconfig
drivers/md/dm-crypt.c
drivers/md/dm-flakey.c
drivers/md/dm-io.c
drivers/md/dm-ioctl.c
drivers/md/dm-kcopyd.c
drivers/md/dm-log-userspace-base.c
drivers/md/dm-log.c
drivers/md/dm-mpath.c
drivers/md/dm-raid.c
drivers/md/dm-snap-persistent.c
drivers/md/dm-snap.c
drivers/md/dm-table.c
drivers/md/dm.c
drivers/md/dm.h
include/linux/device-mapper.h
include/linux/dm-ioctl.h
include/linux/dm-kcopyd.h

index 6b5c42dbbe841c8ec6ed1b608ac3bd77ffe60fa1..2c656ae43ba7f9907571c181ad4800ea5ceeaa9f 100644 (file)
@@ -4,7 +4,8 @@ dm-crypt
 Device-Mapper's "crypt" target provides transparent encryption of block devices
 using the kernel crypto API.
 
-Parameters: <cipher> <key> <iv_offset> <device path> <offset>
+Parameters: <cipher> <key> <iv_offset> <device path> \
+             <offset> [<#opt_params> <opt_params>]
 
 <cipher>
     Encryption cipher and an optional IV generation mode.
@@ -37,6 +38,24 @@ Parameters: <cipher> <key> <iv_offset> <device path> <offset>
 <offset>
     Starting sector within the device where the encrypted data begins.
 
+<#opt_params>
+    Number of optional parameters. If there are no optional parameters,
+    the optional parameters section can be skipped or #opt_params can be zero.
+    Otherwise #opt_params is the number of following arguments.
+
+    Example of optional parameters section:
+        1 allow_discards
+
+allow_discards
+    Block discard requests (a.k.a. TRIM) are passed through the crypt device.
+    The default is to ignore discard requests.
+
+    WARNING: Assess the specific security risks carefully before enabling this
+    option.  For example, allowing discards on encrypted devices may lead to
+    the leak of information about the ciphertext device (filesystem type,
+    used space etc.) if the discarded blocks can be located easily on the
+    device later.
+
 Example scripts
 ===============
 LUKS (Linux Unified Key Setup) is now the preferred way to set up disk
index c8efdfd19a655d02bd14f67d41abf45e16d1e9fd..6ff5c2327227f2040e43ec2b74f97a2bd74ca11f 100644 (file)
@@ -1,17 +1,53 @@
 dm-flakey
 =========
 
-This target is the same as the linear target except that it returns I/O
-errors periodically.  It's been found useful in simulating failing
-devices for testing purposes.
+This target is the same as the linear target except that it exhibits
+unreliable behaviour periodically.  It's been found useful in simulating
+failing devices for testing purposes.
 
 Starting from the time the table is loaded, the device is available for
-<up interval> seconds, then returns errors for <down interval> seconds,
-and then this cycle repeats.
+<up interval> seconds, then exhibits unreliable behaviour for <down
+interval> seconds, and then this cycle repeats.
 
-Parameters: <dev path> <offset> <up interval> <down interval>
+Also, consider using this in combination with the dm-delay target too,
+which can delay reads and writes and/or send them to different
+underlying devices.
+
+Table parameters
+----------------
+  <dev path> <offset> <up interval> <down interval> \
+    [<num_features> [<feature arguments>]]
+
+Mandatory parameters:
     <dev path>: Full pathname to the underlying block-device, or a
                 "major:minor" device-number.
     <offset>: Starting sector within the device.
     <up interval>: Number of seconds device is available.
     <down interval>: Number of seconds device returns errors.
+
+Optional feature parameters:
+  If no feature parameters are present, during the periods of
+  unreliability, all I/O returns errors.
+
+  drop_writes:
+       All write I/O is silently ignored.
+       Read I/O is handled correctly.
+
+  corrupt_bio_byte <Nth_byte> <direction> <value> <flags>:
+       During <down interval>, replace <Nth_byte> of the data of
+       each matching bio with <value>.
+
+    <Nth_byte>: The offset of the byte to replace.
+               Counting starts at 1, to replace the first byte.
+    <direction>: Either 'r' to corrupt reads or 'w' to corrupt writes.
+                'w' is incompatible with drop_writes.
+    <value>: The value (from 0-255) to write.
+    <flags>: Perform the replacement only if bio->bi_rw has all the
+            selected flags set.
+
+Examples:
+  corrupt_bio_byte 32 r 1 0
+       - replaces the 32nd byte of READ bios with the value 1
+
+  corrupt_bio_byte 224 w 0 32
+       - replaces the 224th byte of REQ_META (=32) bios with the value 0
index 33b6b7071ac8d65e386aead1e4e6f6a916e9c502..2a8c11331d2d6e8a861d272546e8a7f7d9f02cf8 100644 (file)
-Device-mapper RAID (dm-raid) is a bridge from DM to MD.  It
-provides a way to use device-mapper interfaces to access the MD RAID
-drivers.
+dm-raid
+-------
 
-As with all device-mapper targets, the nominal public interfaces are the
-constructor (CTR) tables and the status outputs (both STATUSTYPE_INFO
-and STATUSTYPE_TABLE).  The CTR table looks like the following:
+The device-mapper RAID (dm-raid) target provides a bridge from DM to MD.
+It allows the MD RAID drivers to be accessed using a device-mapper
+interface.
 
-1: <s> <l> raid \
-2:      <raid_type> <#raid_params> <raid_params> \
-3:      <#raid_devs> <meta_dev1> <dev1> .. <meta_devN> <devN>
-
-Line 1 contains the standard first three arguments to any device-mapper
-target - the start, length, and target type fields.  The target type in
-this case is "raid".
-
-Line 2 contains the arguments that define the particular raid
-type/personality/level, the required arguments for that raid type, and
-any optional arguments.  Possible raid types include: raid4, raid5_la,
-raid5_ls, raid5_rs, raid6_zr, raid6_nr, and raid6_nc.  (raid1 is
-planned for the future.)  The list of required and optional parameters
-is the same for all the current raid types.  The required parameters are
-positional, while the optional parameters are given as key/value pairs.
-The possible parameters are as follows:
- <chunk_size>           Chunk size in sectors.
- [[no]sync]             Force/Prevent RAID initialization
- [rebuild <idx>]        Rebuild the drive indicated by the index
- [daemon_sleep <ms>]    Time between bitmap daemon work to clear bits
- [min_recovery_rate <kB/sec/disk>]      Throttle RAID initialization
- [max_recovery_rate <kB/sec/disk>]      Throttle RAID initialization
- [max_write_behind <sectors>]           See '-write-behind=' (man mdadm)
- [stripe_cache <sectors>]               Stripe cache size for higher RAIDs
-
-Line 3 contains the list of devices that compose the array in
-metadata/data device pairs.  If the metadata is stored separately, a '-'
-is given for the metadata device position.  If a drive has failed or is
-missing at creation time, a '-' can be given for both the metadata and
-data drives for a given position.
-
-NB. Currently all metadata devices must be specified as '-'.
-
-Examples:
-# RAID4 - 4 data drives, 1 parity
+The target is named "raid" and it accepts the following parameters:
+
+  <raid_type> <#raid_params> <raid_params> \
+    <#raid_devs> <metadata_dev0> <dev0> [.. <metadata_devN> <devN>]
+
+<raid_type>:
+  raid1                RAID1 mirroring
+  raid4                RAID4 dedicated parity disk
+  raid5_la     RAID5 left asymmetric
+               - rotating parity 0 with data continuation
+  raid5_ra     RAID5 right asymmetric
+               - rotating parity N with data continuation
+  raid5_ls     RAID5 left symmetric
+               - rotating parity 0 with data restart
+  raid5_rs     RAID5 right symmetric
+               - rotating parity N with data restart
+  raid6_zr     RAID6 zero restart
+               - rotating parity zero (left-to-right) with data restart
+  raid6_nr     RAID6 N restart
+               - rotating parity N (right-to-left) with data restart
+  raid6_nc     RAID6 N continue
+               - rotating parity N (right-to-left) with data continuation
+
+  Reference: Chapter 4 of
+  http://www.snia.org/sites/default/files/SNIA_DDF_Technical_Position_v2.0.pdf
+
+<#raid_params>: The number of parameters that follow.
+
+<raid_params> consists of
+    Mandatory parameters:
+        <chunk_size>: Chunk size in sectors.  This parameter is often known as
+                     "stripe size".  It is the only mandatory parameter and
+                     is placed first.
+
+    followed by optional parameters (in any order):
+       [sync|nosync]   Force or prevent RAID initialization.
+
+       [rebuild <idx>] Rebuild drive number idx (first drive is 0).
+
+       [daemon_sleep <ms>]
+               Interval between runs of the bitmap daemon that
+               clear bits.  A longer interval means less bitmap I/O but
+               resyncing after a failure is likely to take longer.
+
+       [min_recovery_rate <kB/sec/disk>]  Throttle RAID initialization
+       [max_recovery_rate <kB/sec/disk>]  Throttle RAID initialization
+       [write_mostly <idx>]               Drive index is write-mostly
+       [max_write_behind <sectors>]       See '--write-behind=' (man mdadm)
+       [stripe_cache <sectors>]           Stripe cache size (higher RAIDs only)
+       [region_size <sectors>]
+               The region_size multiplied by the number of regions is the
+               logical size of the array.  The bitmap records the device
+               synchronisation state for each region.
+
+<#raid_devs>: The number of devices composing the array.
+       Each device consists of two entries.  The first is the device
+       containing the metadata (if any); the second is the one containing the
+       data.
+
+       If a drive has failed or is missing at creation time, a '-' can be
+       given for both the metadata and data drives for a given position.
+
+
+Example tables
+--------------
+# RAID4 - 4 data drives, 1 parity (no metadata devices)
 # No metadata devices specified to hold superblock/bitmap info
 # Chunk size of 1MiB
 # (Lines separated for easy reading)
+
 0 1960893648 raid \
         raid4 1 2048 \
         5 - 8:17 - 8:33 - 8:49 - 8:65 - 8:81
 
-# RAID4 - 4 data drives, 1 parity (no metadata devices)
+# RAID4 - 4 data drives, 1 parity (with metadata devices)
 # Chunk size of 1MiB, force RAID initialization,
 #       min recovery rate at 20 kiB/sec/disk
+
 0 1960893648 raid \
-        raid4 4 2048 min_recovery_rate 20 sync\
-        5 - 8:17 - 8:33 - 8:49 - 8:65 - 8:81
+        raid4 4 2048 sync min_recovery_rate 20 \
+        5 8:17 8:18 8:33 8:34 8:49 8:50 8:65 8:66 8:81 8:82
 
-Performing a 'dmsetup table' should display the CTR table used to
-construct the mapping (with possible reordering of optional
-parameters).
+'dmsetup table' displays the table used to construct the mapping.
+The optional parameters are always printed in the order listed
+above with "sync" or "nosync" always output ahead of the other
+arguments, regardless of the order used when originally loading the table.
+Arguments that can be repeated are ordered by value.
 
-Performing a 'dmsetup status' will yield information on the state and
-health of the array.  The output is as follows:
+'dmsetup status' yields information on the state and health of the
+array.
+The output is as follows:
 1: <s> <l> raid \
 2:      <raid_type> <#devices> <1 health char for each dev> <resync_ratio>
 
-Line 1 is standard DM output.  Line 2 is best shown by example:
+Line 1 is the standard output produced by device-mapper.
+Line 2 is produced by the raid target, and best explained by example:
         0 1960893648 raid raid4 5 AAAAA 2/490221568
 Here we can see the RAID type is raid4, there are 5 devices - all of
 which are 'A'live, and the array is 2/490221568 complete with recovery.
+Faulty or missing devices are marked 'D'.  Devices that are out-of-sync
+are marked 'a'.
index 8420129fc5eed67693b99812ab565130b8ad1db7..f75a66e7d312a8e1efed3d2db37ba1c554785e94 100644 (file)
@@ -241,12 +241,13 @@ config DM_MIRROR
          needed for live data migration tools such as 'pvmove'.
 
 config DM_RAID
-       tristate "RAID 4/5/6 target (EXPERIMENTAL)"
+       tristate "RAID 1/4/5/6 target (EXPERIMENTAL)"
        depends on BLK_DEV_DM && EXPERIMENTAL
+       select MD_RAID1
        select MD_RAID456
        select BLK_DEV_MD
        ---help---
-        A dm target that supports RAID4, RAID5 and RAID6 mappings
+        A dm target that supports RAID1, RAID4, RAID5 and RAID6 mappings
 
         A RAID-5 set of N drives with a capacity of C MB per drive provides
         the capacity of C * (N - 1) MB, and protects against a failure
index bae6c4e23d3f7d02798478993c50b976767a81ee..49da55c1528aa01137a61c98d6033c9b84dc2e05 100644 (file)
@@ -30,7 +30,6 @@
 #include <linux/device-mapper.h>
 
 #define DM_MSG_PREFIX "crypt"
-#define MESG_STR(x) x, sizeof(x)
 
 /*
  * context holding the current state of a multi-part conversion
@@ -239,7 +238,7 @@ static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *iv,
                              struct dm_crypt_request *dmreq)
 {
        memset(iv, 0, cc->iv_size);
-       *(u32 *)iv = cpu_to_le32(dmreq->iv_sector & 0xffffffff);
+       *(__le32 *)iv = cpu_to_le32(dmreq->iv_sector & 0xffffffff);
 
        return 0;
 }
@@ -248,7 +247,7 @@ static int crypt_iv_plain64_gen(struct crypt_config *cc, u8 *iv,
                                struct dm_crypt_request *dmreq)
 {
        memset(iv, 0, cc->iv_size);
-       *(u64 *)iv = cpu_to_le64(dmreq->iv_sector);
+       *(__le64 *)iv = cpu_to_le64(dmreq->iv_sector);
 
        return 0;
 }
@@ -415,7 +414,7 @@ static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv,
        struct crypto_cipher *essiv_tfm = this_crypt_config(cc)->iv_private;
 
        memset(iv, 0, cc->iv_size);
-       *(u64 *)iv = cpu_to_le64(dmreq->iv_sector);
+       *(__le64 *)iv = cpu_to_le64(dmreq->iv_sector);
        crypto_cipher_encrypt_one(essiv_tfm, iv, iv);
 
        return 0;
@@ -1575,11 +1574,17 @@ bad_mem:
 static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 {
        struct crypt_config *cc;
-       unsigned int key_size;
+       unsigned int key_size, opt_params;
        unsigned long long tmpll;
        int ret;
+       struct dm_arg_set as;
+       const char *opt_string;
+
+       static struct dm_arg _args[] = {
+               {0, 1, "Invalid number of feature args"},
+       };
 
-       if (argc != 5) {
+       if (argc < 5) {
                ti->error = "Not enough arguments";
                return -EINVAL;
        }
@@ -1648,6 +1653,30 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
        }
        cc->start = tmpll;
 
+       argv += 5;
+       argc -= 5;
+
+       /* Optional parameters */
+       if (argc) {
+               as.argc = argc;
+               as.argv = argv;
+
+               ret = dm_read_arg_group(_args, &as, &opt_params, &ti->error);
+               if (ret)
+                       goto bad;
+
+               opt_string = dm_shift_arg(&as);
+
+               if (opt_params == 1 && opt_string &&
+                   !strcasecmp(opt_string, "allow_discards"))
+                       ti->num_discard_requests = 1;
+               else if (opt_params) {
+                       ret = -EINVAL;
+                       ti->error = "Invalid feature arguments";
+                       goto bad;
+               }
+       }
+
        ret = -ENOMEM;
        cc->io_queue = alloc_workqueue("kcryptd_io",
                                       WQ_NON_REENTRANT|
@@ -1682,9 +1711,16 @@ static int crypt_map(struct dm_target *ti, struct bio *bio,
        struct dm_crypt_io *io;
        struct crypt_config *cc;
 
-       if (bio->bi_rw & REQ_FLUSH) {
+       /*
+        * If bio is REQ_FLUSH or REQ_DISCARD, just bypass crypt queues.
+        * - for REQ_FLUSH device-mapper core ensures that no IO is in-flight
+        * - for REQ_DISCARD caller must use flush if IO ordering matters
+        */
+       if (unlikely(bio->bi_rw & (REQ_FLUSH | REQ_DISCARD))) {
                cc = ti->private;
                bio->bi_bdev = cc->dev->bdev;
+               if (bio_sectors(bio))
+                       bio->bi_sector = cc->start + dm_target_offset(ti, bio->bi_sector);
                return DM_MAPIO_REMAPPED;
        }
 
@@ -1727,6 +1763,10 @@ static int crypt_status(struct dm_target *ti, status_type_t type,
 
                DMEMIT(" %llu %s %llu", (unsigned long long)cc->iv_offset,
                                cc->dev->name, (unsigned long long)cc->start);
+
+               if (ti->num_discard_requests)
+                       DMEMIT(" 1 allow_discards");
+
                break;
        }
        return 0;
@@ -1770,12 +1810,12 @@ static int crypt_message(struct dm_target *ti, unsigned argc, char **argv)
        if (argc < 2)
                goto error;
 
-       if (!strnicmp(argv[0], MESG_STR("key"))) {
+       if (!strcasecmp(argv[0], "key")) {
                if (!test_bit(DM_CRYPT_SUSPENDED, &cc->flags)) {
                        DMWARN("not suspended during key manipulation.");
                        return -EINVAL;
                }
-               if (argc == 3 && !strnicmp(argv[1], MESG_STR("set"))) {
+               if (argc == 3 && !strcasecmp(argv[1], "set")) {
                        ret = crypt_set_key(cc, argv[2]);
                        if (ret)
                                return ret;
@@ -1783,7 +1823,7 @@ static int crypt_message(struct dm_target *ti, unsigned argc, char **argv)
                                ret = cc->iv_gen_ops->init(cc);
                        return ret;
                }
-               if (argc == 2 && !strnicmp(argv[1], MESG_STR("wipe"))) {
+               if (argc == 2 && !strcasecmp(argv[1], "wipe")) {
                        if (cc->iv_gen_ops && cc->iv_gen_ops->wipe) {
                                ret = cc->iv_gen_ops->wipe(cc);
                                if (ret)
@@ -1823,7 +1863,7 @@ static int crypt_iterate_devices(struct dm_target *ti,
 
 static struct target_type crypt_target = {
        .name   = "crypt",
-       .version = {1, 10, 0},
+       .version = {1, 11, 0},
        .module = THIS_MODULE,
        .ctr    = crypt_ctr,
        .dtr    = crypt_dtr,
index ea790623c30ba0d7522905d6f77f7db95c0cdebd..89f73ca22cfa112e7c215f9ab5c846de80dd5da4 100644 (file)
@@ -1,6 +1,6 @@
 /*
  * Copyright (C) 2003 Sistina Software (UK) Limited.
- * Copyright (C) 2004, 2010 Red Hat, Inc. All rights reserved.
+ * Copyright (C) 2004, 2010-2011 Red Hat, Inc. All rights reserved.
  *
  * This file is released under the GPL.
  */
@@ -15,6 +15,9 @@
 
 #define DM_MSG_PREFIX "flakey"
 
+#define all_corrupt_bio_flags_match(bio, fc)   \
+       (((bio)->bi_rw & (fc)->corrupt_bio_flags) == (fc)->corrupt_bio_flags)
+
 /*
  * Flakey: Used for testing only, simulates intermittent,
  * catastrophic device failure.
@@ -25,60 +28,189 @@ struct flakey_c {
        sector_t start;
        unsigned up_interval;
        unsigned down_interval;
+       unsigned long flags;
+       unsigned corrupt_bio_byte;
+       unsigned corrupt_bio_rw;
+       unsigned corrupt_bio_value;
+       unsigned corrupt_bio_flags;
+};
+
+enum feature_flag_bits {
+       DROP_WRITES
 };
 
+/*
+ * Parse "[<num_features> [<feature arguments>]]" into fc.
+ * Returns 0 on success; on failure returns non-zero and reports via ti->error.
+ */
+static int parse_features(struct dm_arg_set *as, struct flakey_c *fc,
+                         struct dm_target *ti)
+{
+       int r;
+       unsigned argc;
+       const char *arg_name;
+
+       static struct dm_arg _args[] = {
+               {0, 6, "Invalid number of feature args"},
+               {1, UINT_MAX, "Invalid corrupt bio byte"},
+               {0, 255, "Invalid corrupt value to write into bio byte (0-255)"},
+               {0, UINT_MAX, "Invalid corrupt bio flags mask"},
+       };
+
+       /* No feature arguments supplied. */
+       if (!as->argc)
+               return 0;
+
+       r = dm_read_arg_group(_args, as, &argc, &ti->error);
+       if (r)
+               return r;
+
+       while (argc) {
+               arg_name = dm_shift_arg(as);
+               argc--;
+
+               /* drop_writes: may be given at most once. */
+               if (!strcasecmp(arg_name, "drop_writes")) {
+                       if (test_and_set_bit(DROP_WRITES, &fc->flags)) {
+                               ti->error = "Feature drop_writes duplicated";
+                               return -EINVAL;
+                       }
+
+                       continue;
+               }
+
+               /* corrupt_bio_byte <Nth_byte> <direction> <value> <bio_flags> */
+               if (!strcasecmp(arg_name, "corrupt_bio_byte")) {
+                       /*
+                        * All four parameters must follow, otherwise a later
+                        * dm_shift_arg() may return NULL and unsigned argc wraps.
+                        */
+                       if (argc < 4) {
+                               ti->error = "Feature corrupt_bio_byte requires parameters";
+                               return -EINVAL;
+                       }
+
+                       r = dm_read_arg(_args + 1, as, &fc->corrupt_bio_byte, &ti->error);
+                       if (r)
+                               return r;
+                       argc--;
+
+                       /* Direction r or w? */
+                       arg_name = dm_shift_arg(as);
+                       if (!strcasecmp(arg_name, "w"))
+                               fc->corrupt_bio_rw = WRITE;
+                       else if (!strcasecmp(arg_name, "r"))
+                               fc->corrupt_bio_rw = READ;
+                       else {
+                               ti->error = "Invalid corrupt bio direction (r or w)";
+                               return -EINVAL;
+                       }
+                       argc--;
+
+                       /* Value of byte (0-255) to write in place of correct one. */
+                       r = dm_read_arg(_args + 2, as, &fc->corrupt_bio_value, &ti->error);
+                       if (r)
+                               return r;
+                       argc--;
+
+                       /* Only corrupt bios with these flags set. */
+                       r = dm_read_arg(_args + 3, as, &fc->corrupt_bio_flags, &ti->error);
+                       if (r)
+                               return r;
+                       argc--;
+
+                       continue;
+               }
+
+               ti->error = "Unrecognised flakey feature requested";
+               return -EINVAL;
+       }
+
+       if (test_bit(DROP_WRITES, &fc->flags) && (fc->corrupt_bio_rw == WRITE)) {
+               ti->error = "drop_writes is incompatible with corrupt_bio_byte with the WRITE flag set";
+               return -EINVAL;
+       }
+
+       return 0;
+}
+
 /*
- * Construct a flakey mapping: <dev_path> <offset> <up interval> <down interval>
+ * Construct a flakey mapping:
+ * <dev_path> <offset> <up interval> <down interval> [<#feature args> [<arg>]*]
+ *
+ *   Feature args:
+ *     [drop_writes]
+ *     [corrupt_bio_byte <Nth_byte> <direction> <value> <bio_flags>]
+ *
+ *   Nth_byte starts from 1 for the first byte.
+ *   Direction is r for READ or w for WRITE.
+ *   bio_flags is ignored if 0.
  */
 static int flakey_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 {
+       static struct dm_arg _args[] = {
+               {0, UINT_MAX, "Invalid up interval"},
+               {0, UINT_MAX, "Invalid down interval"},
+       };
+
+       int r;
        struct flakey_c *fc;
-       unsigned long long tmp;
+       unsigned long long tmpll;
+       struct dm_arg_set as;
+       const char *devname;
 
-       if (argc != 4) {
-               ti->error = "dm-flakey: Invalid argument count";
+       as.argc = argc;
+       as.argv = argv;
+
+       if (argc < 4) {
+               ti->error = "Invalid argument count";
                return -EINVAL;
        }
 
-       fc = kmalloc(sizeof(*fc), GFP_KERNEL);
+       fc = kzalloc(sizeof(*fc), GFP_KERNEL);
        if (!fc) {
-               ti->error = "dm-flakey: Cannot allocate linear context";
+               ti->error = "Cannot allocate linear context";
                return -ENOMEM;
        }
        fc->start_time = jiffies;
 
-       if (sscanf(argv[1], "%llu", &tmp) != 1) {
-               ti->error = "dm-flakey: Invalid device sector";
+       devname = dm_shift_arg(&as);
+
+       if (sscanf(dm_shift_arg(&as), "%llu", &tmpll) != 1) {
+               ti->error = "Invalid device sector";
                goto bad;
        }
-       fc->start = tmp;
+       fc->start = tmpll;
 
-       if (sscanf(argv[2], "%u", &fc->up_interval) != 1) {
-               ti->error = "dm-flakey: Invalid up interval";
+       r = dm_read_arg(_args, &as, &fc->up_interval, &ti->error);
+       if (r)
                goto bad;
-       }
 
-       if (sscanf(argv[3], "%u", &fc->down_interval) != 1) {
-               ti->error = "dm-flakey: Invalid down interval";
+       r = dm_read_arg(_args, &as, &fc->down_interval, &ti->error);
+       if (r)
                goto bad;
-       }
 
        if (!(fc->up_interval + fc->down_interval)) {
-               ti->error = "dm-flakey: Total (up + down) interval is zero";
+               ti->error = "Total (up + down) interval is zero";
                goto bad;
        }
 
        if (fc->up_interval + fc->down_interval < fc->up_interval) {
-               ti->error = "dm-flakey: Interval overflow";
+               ti->error = "Interval overflow";
                goto bad;
        }
 
-       if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &fc->dev)) {
-               ti->error = "dm-flakey: Device lookup failed";
+       r = parse_features(&as, fc, ti);
+       if (r)
+               goto bad;
+
+       if (dm_get_device(ti, devname, dm_table_get_mode(ti->table), &fc->dev)) {
+               ti->error = "Device lookup failed";
                goto bad;
        }
 
        ti->num_flush_requests = 1;
+       ti->num_discard_requests = 1;
        ti->private = fc;
        return 0;
 
@@ -99,7 +231,7 @@ static sector_t flakey_map_sector(struct dm_target *ti, sector_t bi_sector)
 {
        struct flakey_c *fc = ti->private;
 
-       return fc->start + (bi_sector - ti->begin);
+       return fc->start + dm_target_offset(ti, bi_sector);
 }
 
 static void flakey_map_bio(struct dm_target *ti, struct bio *bio)
@@ -111,6 +243,25 @@ static void flakey_map_bio(struct dm_target *ti, struct bio *bio)
                bio->bi_sector = flakey_map_sector(ti, bio->bi_sector);
 }
 
+static void corrupt_bio_data(struct bio *bio, struct flakey_c *fc)
+{
+       unsigned bio_bytes = bio_cur_bytes(bio);
+       char *data = bio_data(bio);
+
+       /*
+        * Overwrite the Nth byte (1-based); N == 0 would index data[-1].
+        */
+       if (data && fc->corrupt_bio_byte && bio_bytes >= fc->corrupt_bio_byte) {
+               data[fc->corrupt_bio_byte - 1] = fc->corrupt_bio_value;
+
+               DMDEBUG("Corrupting data bio=%p by writing %u to byte %u "
+                       "(rw=%c bi_rw=%lu bi_sector=%llu cur_bytes=%u)\n",
+                       bio, fc->corrupt_bio_value, fc->corrupt_bio_byte,
+                       (bio_data_dir(bio) == WRITE) ? 'w' : 'r',
+                       bio->bi_rw, (unsigned long long)bio->bi_sector, bio_bytes);
+       }
+}
+
 static int flakey_map(struct dm_target *ti, struct bio *bio,
                      union map_info *map_context)
 {
@@ -119,18 +270,71 @@ static int flakey_map(struct dm_target *ti, struct bio *bio,
 
        /* Are we alive ? */
        elapsed = (jiffies - fc->start_time) / HZ;
-       if (elapsed % (fc->up_interval + fc->down_interval) >= fc->up_interval)
+       if (elapsed % (fc->up_interval + fc->down_interval) >= fc->up_interval) {
+               /*
+                * Flag this bio as submitted while down.
+                */
+               map_context->ll = 1;
+
+               /*
+                * Map reads as normal.
+                */
+               if (bio_data_dir(bio) == READ)
+                       goto map_bio;
+
+               /*
+                * Drop writes?
+                */
+               if (test_bit(DROP_WRITES, &fc->flags)) {
+                       bio_endio(bio, 0);
+                       return DM_MAPIO_SUBMITTED;
+               }
+
+               /*
+                * Corrupt matching writes.
+                */
+               if (fc->corrupt_bio_byte && (fc->corrupt_bio_rw == WRITE)) {
+                       if (all_corrupt_bio_flags_match(bio, fc))
+                               corrupt_bio_data(bio, fc);
+                       goto map_bio;
+               }
+
+               /*
+                * By default, error all I/O.
+                */
                return -EIO;
+       }
 
+map_bio:
        flakey_map_bio(ti, bio);
 
        return DM_MAPIO_REMAPPED;
 }
 
+/*
+ * Completion hook.  Corrupt successful READs that were submitted while down
+ * (flakey_map() sets map_context->ll) and whose flags match, but only when
+ * corrupt_bio_byte is non-zero: 0 means the feature was never requested.
+ */
+static int flakey_end_io(struct dm_target *ti, struct bio *bio,
+                        int error, union map_info *map_context)
+{
+       struct flakey_c *fc = ti->private;
+
+       if (!error && map_context->ll && fc->corrupt_bio_byte &&
+           (bio_data_dir(bio) == READ) && (fc->corrupt_bio_rw == READ) &&
+           all_corrupt_bio_flags_match(bio, fc))
+               corrupt_bio_data(bio, fc);
+
+       return error;
+}
+
 static int flakey_status(struct dm_target *ti, status_type_t type,
                         char *result, unsigned int maxlen)
 {
+       unsigned sz = 0;
        struct flakey_c *fc = ti->private;
+       unsigned drop_writes;
 
        switch (type) {
        case STATUSTYPE_INFO:
@@ -138,9 +342,22 @@ static int flakey_status(struct dm_target *ti, status_type_t type,
                break;
 
        case STATUSTYPE_TABLE:
-               snprintf(result, maxlen, "%s %llu %u %u", fc->dev->name,
-                        (unsigned long long)fc->start, fc->up_interval,
-                        fc->down_interval);
+               DMEMIT("%s %llu %u %u ", fc->dev->name,
+                      (unsigned long long)fc->start, fc->up_interval,
+                      fc->down_interval);
+
+               drop_writes = test_bit(DROP_WRITES, &fc->flags);
+               DMEMIT("%u ", drop_writes + (fc->corrupt_bio_byte > 0) * 5);
+
+               if (drop_writes)
+                       DMEMIT("drop_writes ");
+
+               if (fc->corrupt_bio_byte)
+                       DMEMIT("corrupt_bio_byte %u %c %u %u ",
+                              fc->corrupt_bio_byte,
+                              (fc->corrupt_bio_rw == WRITE) ? 'w' : 'r',
+                              fc->corrupt_bio_value, fc->corrupt_bio_flags);
+
                break;
        }
        return 0;
@@ -177,11 +394,12 @@ static int flakey_iterate_devices(struct dm_target *ti, iterate_devices_callout_
 
 static struct target_type flakey_target = {
        .name   = "flakey",
-       .version = {1, 1, 0},
+       .version = {1, 2, 0},
        .module = THIS_MODULE,
        .ctr    = flakey_ctr,
        .dtr    = flakey_dtr,
        .map    = flakey_map,
+       .end_io = flakey_end_io,
        .status = flakey_status,
        .ioctl  = flakey_ioctl,
        .merge  = flakey_merge,
index 2067288f61f9b5e868da28c6c7c9694b9a5269cf..ad2eba40e3190e700eab3136b8af5dd150cbf208 100644 (file)
@@ -38,6 +38,8 @@ struct io {
        struct dm_io_client *client;
        io_notify_fn callback;
        void *context;
+       void *vma_invalidate_address;
+       unsigned long vma_invalidate_size;
 } __attribute__((aligned(DM_IO_MAX_REGIONS)));
 
 static struct kmem_cache *_dm_io_cache;
@@ -116,6 +118,10 @@ static void dec_count(struct io *io, unsigned int region, int error)
                set_bit(region, &io->error_bits);
 
        if (atomic_dec_and_test(&io->count)) {
+               if (io->vma_invalidate_size)
+                       invalidate_kernel_vmap_range(io->vma_invalidate_address,
+                                                    io->vma_invalidate_size);
+
                if (io->sleeper)
                        wake_up_process(io->sleeper);
 
@@ -159,6 +165,9 @@ struct dpages {
 
        unsigned context_u;
        void *context_ptr;
+
+       void *vma_invalidate_address;
+       unsigned long vma_invalidate_size;
 };
 
 /*
@@ -377,6 +386,9 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions,
        io->sleeper = current;
        io->client = client;
 
+       io->vma_invalidate_address = dp->vma_invalidate_address;
+       io->vma_invalidate_size = dp->vma_invalidate_size;
+
        dispatch_io(rw, num_regions, where, dp, io, 1);
 
        while (1) {
@@ -415,13 +427,21 @@ static int async_io(struct dm_io_client *client, unsigned int num_regions,
        io->callback = fn;
        io->context = context;
 
+       io->vma_invalidate_address = dp->vma_invalidate_address;
+       io->vma_invalidate_size = dp->vma_invalidate_size;
+
        dispatch_io(rw, num_regions, where, dp, io, 0);
        return 0;
 }
 
-static int dp_init(struct dm_io_request *io_req, struct dpages *dp)
+static int dp_init(struct dm_io_request *io_req, struct dpages *dp,
+                  unsigned long size)
 {
        /* Set up dpages based on memory type */
+
+       dp->vma_invalidate_address = NULL;
+       dp->vma_invalidate_size = 0;
+
        switch (io_req->mem.type) {
        case DM_IO_PAGE_LIST:
                list_dp_init(dp, io_req->mem.ptr.pl, io_req->mem.offset);
@@ -432,6 +452,11 @@ static int dp_init(struct dm_io_request *io_req, struct dpages *dp)
                break;
 
        case DM_IO_VMA:
+               flush_kernel_vmap_range(io_req->mem.ptr.vma, size);
+               if ((io_req->bi_rw & RW_MASK) == READ) {
+                       dp->vma_invalidate_address = io_req->mem.ptr.vma;
+                       dp->vma_invalidate_size = size;
+               }
                vm_dp_init(dp, io_req->mem.ptr.vma);
                break;
 
@@ -460,7 +485,7 @@ int dm_io(struct dm_io_request *io_req, unsigned num_regions,
        int r;
        struct dpages dp;
 
-       r = dp_init(io_req, &dp);
+       r = dp_init(io_req, &dp, (unsigned long)where->count << SECTOR_SHIFT);
        if (r)
                return r;
 
index 4cacdad2270a345e1ef8455eddf5c2a2838a22e1..2e9a3ca37bdd39c6cbaf36f800b77aa3a04a638f 100644 (file)
@@ -128,6 +128,24 @@ static struct hash_cell *__get_uuid_cell(const char *str)
        return NULL;
 }
 
+static struct hash_cell *__get_dev_cell(uint64_t dev)
+{
+       struct mapped_device *md;
+       struct hash_cell *hc;
+
+       md = dm_get_md(huge_decode_dev(dev));   /* look the device up by its decoded device number */
+       if (!md)
+               return NULL;
+
+       hc = dm_get_mdptr(md);
+       if (!hc) {
+               dm_put(md);     /* no hash cell attached; presumably drops the reference taken by dm_get_md() */
+               return NULL;
+       }
+
+       return hc;      /* NOTE(review): md is not dm_put() on this path -- the caller presumably inherits the reference; confirm */
+}
+
 /*-----------------------------------------------------------------
  * Inserting, removing and renaming a device.
  *---------------------------------------------------------------*/
@@ -718,25 +736,45 @@ static int dev_create(struct dm_ioctl *param, size_t param_size)
  */
 static struct hash_cell *__find_device_hash_cell(struct dm_ioctl *param)
 {
-       struct mapped_device *md;
-       void *mdptr = NULL;
+       struct hash_cell *hc = NULL;
 
-       if (*param->uuid)
-               return __get_uuid_cell(param->uuid);
+       if (*param->uuid) {
+               if (*param->name || param->dev)
+                       return NULL;    /* uuid given together with a name or dev: reject */
 
-       if (*param->name)
-               return __get_name_cell(param->name);
+               hc = __get_uuid_cell(param->uuid);
+               if (!hc)
+                       return NULL;
+       } else if (*param->name) {
+               if (param->dev)
+                       return NULL;    /* name given together with a dev: reject */
 
-       md = dm_get_md(huge_decode_dev(param->dev));
-       if (!md)
-               goto out;
+               hc = __get_name_cell(param->name);
+               if (!hc)
+                       return NULL;
+       } else if (param->dev) {
+               hc = __get_dev_cell(param->dev);
+               if (!hc)
+                       return NULL;
+       } else
+               return NULL;    /* no uuid, name or dev supplied at all */
 
-       mdptr = dm_get_mdptr(md);
-       if (!mdptr)
-               dm_put(md);
+       /*
+        * Sneakily write in both the name and the uuid
+        * while we have the cell.
+        *
+        * This runs for every successful lookup (by uuid, name or dev),
+        * so param comes back with name and uuid filled in from the cell
+        * and DM_INACTIVE_PRESENT_FLAG set or cleared to match whether
+        * hc->new_map is non-NULL.
+        */
+       strlcpy(param->name, hc->name, sizeof(param->name));
+       if (hc->uuid)
+               strlcpy(param->uuid, hc->uuid, sizeof(param->uuid));
+       else
+               param->uuid[0] = '\0';
 
-out:
-       return mdptr;
+       if (hc->new_map)
+               param->flags |= DM_INACTIVE_PRESENT_FLAG;
+       else
+               param->flags &= ~DM_INACTIVE_PRESENT_FLAG;
+
+       return hc;
 }
 
 static struct mapped_device *find_device(struct dm_ioctl *param)
@@ -746,24 +784,8 @@ static struct mapped_device *find_device(struct dm_ioctl *param)
 
        down_read(&_hash_lock);
        hc = __find_device_hash_cell(param);
-       if (hc) {
+       if (hc)
                md = hc->md;
-
-               /*
-                * Sneakily write in both the name and the uuid
-                * while we have the cell.
-                */
-               strlcpy(param->name, hc->name, sizeof(param->name));
-               if (hc->uuid)
-                       strlcpy(param->uuid, hc->uuid, sizeof(param->uuid));
-               else
-                       param->uuid[0] = '\0';
-
-               if (hc->new_map)
-                       param->flags |= DM_INACTIVE_PRESENT_FLAG;
-               else
-                       param->flags &= ~DM_INACTIVE_PRESENT_FLAG;
-       }
        up_read(&_hash_lock);
 
        return md;
@@ -1402,6 +1424,11 @@ static int target_message(struct dm_ioctl *param, size_t param_size)
                goto out;
        }
 
+       if (!argc) {
+               DMWARN("Empty message received.");
+               goto out;
+       }
+
        table = dm_get_live_table(md);
        if (!table)
                goto out_argv;
index 320401dec1044215b16bd1c784c1fca2883b3537..f82147029636d87b30e990daa230dfb17ca69c50 100644 (file)
@@ -224,8 +224,6 @@ struct kcopyd_job {
        unsigned int num_dests;
        struct dm_io_region dests[DM_KCOPYD_MAX_REGIONS];
 
-       sector_t offset;
-       unsigned int nr_pages;
        struct page_list *pages;
 
        /*
@@ -380,7 +378,7 @@ static int run_io_job(struct kcopyd_job *job)
                .bi_rw = job->rw,
                .mem.type = DM_IO_PAGE_LIST,
                .mem.ptr.pl = job->pages,
-               .mem.offset = job->offset,
+               .mem.offset = 0,
                .notify.fn = complete_io,
                .notify.context = job,
                .client = job->kc->io_client,
@@ -397,10 +395,9 @@ static int run_io_job(struct kcopyd_job *job)
 static int run_pages_job(struct kcopyd_job *job)
 {
        int r;
+       unsigned nr_pages = dm_div_up(job->dests[0].count, PAGE_SIZE >> 9);
 
-       job->nr_pages = dm_div_up(job->dests[0].count + job->offset,
-                                 PAGE_SIZE >> 9);
-       r = kcopyd_get_pages(job->kc, job->nr_pages, &job->pages);
+       r = kcopyd_get_pages(job->kc, nr_pages, &job->pages);
        if (!r) {
                /* this job is ready for io */
                push(&job->kc->io_jobs, job);
@@ -602,8 +599,6 @@ int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from,
        job->num_dests = num_dests;
        memcpy(&job->dests, dests, sizeof(*dests) * num_dests);
 
-       job->offset = 0;
-       job->nr_pages = 0;
        job->pages = NULL;
 
        job->fn = fn;
@@ -622,6 +617,37 @@ int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from,
 }
 EXPORT_SYMBOL(dm_kcopyd_copy);
 
+void *dm_kcopyd_prepare_callback(struct dm_kcopyd_client *kc,
+                                dm_kcopyd_notify_fn fn, void *context)
+{
+       struct kcopyd_job *job;
+
+       job = mempool_alloc(kc->job_pool, GFP_NOIO);    /* NOTE(review): result is used without a NULL check -- presumably relies on this allocation not failing; confirm */
+
+       memset(job, 0, sizeof(struct kcopyd_job));      /* only kc, fn and context are set below; every other field stays zero */
+       job->kc = kc;
+       job->fn = fn;
+       job->context = context;
+
+       atomic_inc(&kc->nr_jobs);       /* count the job against this client from now on */
+
+       return job;     /* handed out as an opaque void *; dm_kcopyd_do_callback() accepts it back */
+}
+EXPORT_SYMBOL(dm_kcopyd_prepare_callback);
+
+void dm_kcopyd_do_callback(void *j, int read_err, unsigned long write_err)
+{
+       struct kcopyd_job *job = j;     /* expected to be a handle from dm_kcopyd_prepare_callback() */
+       struct dm_kcopyd_client *kc = job->kc;
+
+       job->read_err = read_err;
+       job->write_err = write_err;
+
+       push(&kc->complete_jobs, job);  /* goes straight onto the completed list; no I/O is issued here */
+       wake(kc);       /* presumably kicks the client's worker, which then runs job->fn -- confirm */
+}
+EXPORT_SYMBOL(dm_kcopyd_do_callback);
+
 /*
  * Cancels a kcopyd job, eg. someone might be deactivating a
  * mirror.
index aa2e0c374ab3e0c985e6bdc6536807bc07bcf3ce..1021c89860116a5bb6e3a2faea1362c0ea805f52 100644 (file)
@@ -394,8 +394,7 @@ static int flush_by_group(struct log_c *lc, struct list_head *flush_list)
                        group[count] = fe->region;
                        count++;
 
-                       list_del(&fe->list);
-                       list_add(&fe->list, &tmp_list);
+                       list_move(&fe->list, &tmp_list);
 
                        type = fe->type;
                        if (count >= MAX_FLUSH_GROUP_COUNT)
index 948e3f4925bfe6d28a39aeed22e11324f1b1b8df..3b52bb72bd1f0cb8717798c121008a76784b9aa0 100644 (file)
@@ -197,15 +197,21 @@ EXPORT_SYMBOL(dm_dirty_log_destroy);
 #define MIRROR_DISK_VERSION 2
 #define LOG_OFFSET 2
 
-struct log_header {
-       uint32_t magic;
+struct log_header_disk {       /* header as stored on the log device: explicit little-endian fields, packed */
+       __le32 magic;
 
        /*
         * Simple, incrementing version. no backward
         * compatibility.
         */
+       __le32 version;
+       __le64 nr_regions;
+} __packed;
+
+struct log_header_core {       /* in-memory copy in CPU byte order; header_to_disk()/header_from_disk() convert between the two */
+       uint32_t magic;
        uint32_t version;
-       sector_t nr_regions;
+       uint64_t nr_regions;
 };
 
 struct log_c {
@@ -239,10 +245,10 @@ struct log_c {
        int log_dev_failed;
        int log_dev_flush_failed;
        struct dm_dev *log_dev;
-       struct log_header header;
+       struct log_header_core header;
 
        struct dm_io_region header_location;
-       struct log_header *disk_header;
+       struct log_header_disk *disk_header;
 };
 
 /*
@@ -251,34 +257,34 @@ struct log_c {
  */
 static inline int log_test_bit(uint32_t *bs, unsigned bit)
 {
-       return test_bit_le(bit, (unsigned long *) bs) ? 1 : 0;
+       return test_bit_le(bit, bs) ? 1 : 0;    /* normalise the result to exactly 0 or 1 */
 }
 
 static inline void log_set_bit(struct log_c *l,
                               uint32_t *bs, unsigned bit)
 {
-       __test_and_set_bit_le(bit, (unsigned long *) bs);
+       __set_bit_le(bit, bs);  /* the previous value of the bit is not needed, so the plain set variant is used */
        l->touched_cleaned = 1;
 }
 
 static inline void log_clear_bit(struct log_c *l,
                                 uint32_t *bs, unsigned bit)
 {
-       __test_and_clear_bit_le(bit, (unsigned long *) bs);
+       __clear_bit_le(bit, bs);        /* the previous value of the bit is not needed, so the plain clear variant is used */
        l->touched_dirtied = 1;
 }
 
 /*----------------------------------------------------------------
  * Header IO
  *--------------------------------------------------------------*/
-static void header_to_disk(struct log_header *core, struct log_header *disk)
+static void header_to_disk(struct log_header_core *core, struct log_header_disk *disk)
 {
        disk->magic = cpu_to_le32(core->magic);        /* each field: CPU byte order -> little-endian */
        disk->version = cpu_to_le32(core->version);
        disk->nr_regions = cpu_to_le64(core->nr_regions);      /* 64-bit field, hence the le64 conversion */
 }
 
-static void header_from_disk(struct log_header *core, struct log_header *disk)
+static void header_from_disk(struct log_header_core *core, struct log_header_disk *disk)
 {
        core->magic = le32_to_cpu(disk->magic);
        core->version = le32_to_cpu(disk->version);
@@ -486,7 +492,7 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti,
        memset(lc->sync_bits, (sync == NOSYNC) ? -1 : 0, bitset_size);
        lc->sync_count = (sync == NOSYNC) ? region_count : 0;
 
-       lc->recovering_bits = vmalloc(bitset_size);
+       lc->recovering_bits = vzalloc(bitset_size);
        if (!lc->recovering_bits) {
                DMWARN("couldn't allocate sync bitset");
                vfree(lc->sync_bits);
@@ -498,7 +504,6 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti,
                kfree(lc);
                return -ENOMEM;
        }
-       memset(lc->recovering_bits, 0, bitset_size);
        lc->sync_search = 0;
        log->context = lc;
 
@@ -739,8 +744,7 @@ static int core_get_resync_work(struct dm_dirty_log *log, region_t *region)
                return 0;
 
        do {
-               *region = find_next_zero_bit_le(
-                                            (unsigned long *) lc->sync_bits,
+               *region = find_next_zero_bit_le(lc->sync_bits,
                                             lc->region_count,
                                             lc->sync_search);
                lc->sync_search = *region + 1;
index c3547016f0f1e7156d1b28b0fded62b5bf534008..5e0090ef4182e71de26afec04d23edb19b3f4e39 100644 (file)
@@ -22,7 +22,6 @@
 #include <linux/atomic.h>
 
 #define DM_MSG_PREFIX "multipath"
-#define MESG_STR(x) x, sizeof(x)
 #define DM_PG_INIT_DELAY_MSECS 2000
 #define DM_PG_INIT_DELAY_DEFAULT ((unsigned) -1)
 
@@ -505,80 +504,29 @@ static void trigger_event(struct work_struct *work)
  *      <#paths> <#per-path selector args>
  *         [<path> [<arg>]* ]+ ]+
  *---------------------------------------------------------------*/
-struct param {
-       unsigned min;
-       unsigned max;
-       char *error;
-};
-
-static int read_param(struct param *param, char *str, unsigned *v, char **error)
-{
-       if (!str ||
-           (sscanf(str, "%u", v) != 1) ||
-           (*v < param->min) ||
-           (*v > param->max)) {
-               *error = param->error;
-               return -EINVAL;
-       }
-
-       return 0;
-}
-
-struct arg_set {
-       unsigned argc;
-       char **argv;
-};
-
-static char *shift(struct arg_set *as)
-{
-       char *r;
-
-       if (as->argc) {
-               as->argc--;
-               r = *as->argv;
-               as->argv++;
-               return r;
-       }
-
-       return NULL;
-}
-
-static void consume(struct arg_set *as, unsigned n)
-{
-       BUG_ON (as->argc < n);
-       as->argc -= n;
-       as->argv += n;
-}
-
-static int parse_path_selector(struct arg_set *as, struct priority_group *pg,
+static int parse_path_selector(struct dm_arg_set *as, struct priority_group *pg,
                               struct dm_target *ti)
 {
        int r;
        struct path_selector_type *pst;
        unsigned ps_argc;
 
-       static struct param _params[] = {
+       static struct dm_arg _args[] = {
                {0, 1024, "invalid number of path selector args"},
        };
 
-       pst = dm_get_path_selector(shift(as));
+       pst = dm_get_path_selector(dm_shift_arg(as));
        if (!pst) {
                ti->error = "unknown path selector type";
                return -EINVAL;
        }
 
-       r = read_param(_params, shift(as), &ps_argc, &ti->error);
+       r = dm_read_arg_group(_args, as, &ps_argc, &ti->error);
        if (r) {
                dm_put_path_selector(pst);
                return -EINVAL;
        }
 
-       if (ps_argc > as->argc) {
-               dm_put_path_selector(pst);
-               ti->error = "not enough arguments for path selector";
-               return -EINVAL;
-       }
-
        r = pst->create(&pg->ps, ps_argc, as->argv);
        if (r) {
                dm_put_path_selector(pst);
@@ -587,12 +535,12 @@ static int parse_path_selector(struct arg_set *as, struct priority_group *pg,
        }
 
        pg->ps.type = pst;
-       consume(as, ps_argc);
+       dm_consume_args(as, ps_argc);
 
        return 0;
 }
 
-static struct pgpath *parse_path(struct arg_set *as, struct path_selector *ps,
+static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps,
                               struct dm_target *ti)
 {
        int r;
@@ -609,7 +557,7 @@ static struct pgpath *parse_path(struct arg_set *as, struct path_selector *ps,
        if (!p)
                return ERR_PTR(-ENOMEM);
 
-       r = dm_get_device(ti, shift(as), dm_table_get_mode(ti->table),
+       r = dm_get_device(ti, dm_shift_arg(as), dm_table_get_mode(ti->table),
                          &p->path.dev);
        if (r) {
                ti->error = "error getting device";
@@ -660,16 +608,16 @@ static struct pgpath *parse_path(struct arg_set *as, struct path_selector *ps,
        return ERR_PTR(r);
 }
 
-static struct priority_group *parse_priority_group(struct arg_set *as,
+static struct priority_group *parse_priority_group(struct dm_arg_set *as,
                                                   struct multipath *m)
 {
-       static struct param _params[] = {
+       static struct dm_arg _args[] = {
                {1, 1024, "invalid number of paths"},
                {0, 1024, "invalid number of selector args"}
        };
 
        int r;
-       unsigned i, nr_selector_args, nr_params;
+       unsigned i, nr_selector_args, nr_args;
        struct priority_group *pg;
        struct dm_target *ti = m->ti;
 
@@ -693,26 +641,26 @@ static struct priority_group *parse_priority_group(struct arg_set *as,
        /*
         * read the paths
         */
-       r = read_param(_params, shift(as), &pg->nr_pgpaths, &ti->error);
+       r = dm_read_arg(_args, as, &pg->nr_pgpaths, &ti->error);
        if (r)
                goto bad;
 
-       r = read_param(_params + 1, shift(as), &nr_selector_args, &ti->error);
+       r = dm_read_arg(_args + 1, as, &nr_selector_args, &ti->error);
        if (r)
                goto bad;
 
-       nr_params = 1 + nr_selector_args;
+       nr_args = 1 + nr_selector_args;
        for (i = 0; i < pg->nr_pgpaths; i++) {
                struct pgpath *pgpath;
-               struct arg_set path_args;
+               struct dm_arg_set path_args;
 
-               if (as->argc < nr_params) {
+               if (as->argc < nr_args) {
                        ti->error = "not enough path parameters";
                        r = -EINVAL;
                        goto bad;
                }
 
-               path_args.argc = nr_params;
+               path_args.argc = nr_args;
                path_args.argv = as->argv;
 
                pgpath = parse_path(&path_args, &pg->ps, ti);
@@ -723,7 +671,7 @@ static struct priority_group *parse_priority_group(struct arg_set *as,
 
                pgpath->pg = pg;
                list_add_tail(&pgpath->list, &pg->pgpaths);
-               consume(as, nr_params);
+               dm_consume_args(as, nr_args);
        }
 
        return pg;
@@ -733,28 +681,23 @@ static struct priority_group *parse_priority_group(struct arg_set *as,
        return ERR_PTR(r);
 }
 
-static int parse_hw_handler(struct arg_set *as, struct multipath *m)
+static int parse_hw_handler(struct dm_arg_set *as, struct multipath *m)
 {
        unsigned hw_argc;
        int ret;
        struct dm_target *ti = m->ti;
 
-       static struct param _params[] = {
+       static struct dm_arg _args[] = {
                {0, 1024, "invalid number of hardware handler args"},
        };
 
-       if (read_param(_params, shift(as), &hw_argc, &ti->error))
+       if (dm_read_arg_group(_args, as, &hw_argc, &ti->error))
                return -EINVAL;
 
        if (!hw_argc)
                return 0;
 
-       if (hw_argc > as->argc) {
-               ti->error = "not enough arguments for hardware handler";
-               return -EINVAL;
-       }
-
-       m->hw_handler_name = kstrdup(shift(as), GFP_KERNEL);
+       m->hw_handler_name = kstrdup(dm_shift_arg(as), GFP_KERNEL);
        request_module("scsi_dh_%s", m->hw_handler_name);
        if (scsi_dh_handler_exist(m->hw_handler_name) == 0) {
                ti->error = "unknown hardware handler type";
@@ -778,7 +721,7 @@ static int parse_hw_handler(struct arg_set *as, struct multipath *m)
                for (i = 0, p+=j+1; i <= hw_argc - 2; i++, p+=j+1)
                        j = sprintf(p, "%s", as->argv[i]);
        }
-       consume(as, hw_argc - 1);
+       dm_consume_args(as, hw_argc - 1);
 
        return 0;
 fail:
@@ -787,20 +730,20 @@ fail:
        return ret;
 }
 
-static int parse_features(struct arg_set *as, struct multipath *m)
+static int parse_features(struct dm_arg_set *as, struct multipath *m)
 {
        int r;
        unsigned argc;
        struct dm_target *ti = m->ti;
-       const char *param_name;
+       const char *arg_name;
 
-       static struct param _params[] = {
+       static struct dm_arg _args[] = {
                {0, 5, "invalid number of feature args"},
                {1, 50, "pg_init_retries must be between 1 and 50"},
                {0, 60000, "pg_init_delay_msecs must be between 0 and 60000"},
        };
 
-       r = read_param(_params, shift(as), &argc, &ti->error);
+       r = dm_read_arg_group(_args, as, &argc, &ti->error);
        if (r)
                return -EINVAL;
 
@@ -808,26 +751,24 @@ static int parse_features(struct arg_set *as, struct multipath *m)
                return 0;
 
        do {
-               param_name = shift(as);
+               arg_name = dm_shift_arg(as);
                argc--;
 
-               if (!strnicmp(param_name, MESG_STR("queue_if_no_path"))) {
+               if (!strcasecmp(arg_name, "queue_if_no_path")) {
                        r = queue_if_no_path(m, 1, 0);
                        continue;
                }
 
-               if (!strnicmp(param_name, MESG_STR("pg_init_retries")) &&
+               if (!strcasecmp(arg_name, "pg_init_retries") &&
                    (argc >= 1)) {
-                       r = read_param(_params + 1, shift(as),
-                                      &m->pg_init_retries, &ti->error);
+                       r = dm_read_arg(_args + 1, as, &m->pg_init_retries, &ti->error);
                        argc--;
                        continue;
                }
 
-               if (!strnicmp(param_name, MESG_STR("pg_init_delay_msecs")) &&
+               if (!strcasecmp(arg_name, "pg_init_delay_msecs") &&
                    (argc >= 1)) {
-                       r = read_param(_params + 2, shift(as),
-                                      &m->pg_init_delay_msecs, &ti->error);
+                       r = dm_read_arg(_args + 2, as, &m->pg_init_delay_msecs, &ti->error);
                        argc--;
                        continue;
                }
@@ -842,15 +783,15 @@ static int parse_features(struct arg_set *as, struct multipath *m)
 static int multipath_ctr(struct dm_target *ti, unsigned int argc,
                         char **argv)
 {
-       /* target parameters */
-       static struct param _params[] = {
+       /* target arguments */
+       static struct dm_arg _args[] = {
                {0, 1024, "invalid number of priority groups"},
                {0, 1024, "invalid initial priority group number"},
        };
 
        int r;
        struct multipath *m;
-       struct arg_set as;
+       struct dm_arg_set as;
        unsigned pg_count = 0;
        unsigned next_pg_num;
 
@@ -871,11 +812,11 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc,
        if (r)
                goto bad;
 
-       r = read_param(_params, shift(&as), &m->nr_priority_groups, &ti->error);
+       r = dm_read_arg(_args, &as, &m->nr_priority_groups, &ti->error);
        if (r)
                goto bad;
 
-       r = read_param(_params + 1, shift(&as), &next_pg_num, &ti->error);
+       r = dm_read_arg(_args + 1, &as, &next_pg_num, &ti->error);
        if (r)
                goto bad;
 
@@ -1505,10 +1446,10 @@ static int multipath_message(struct dm_target *ti, unsigned argc, char **argv)
        }
 
        if (argc == 1) {
-               if (!strnicmp(argv[0], MESG_STR("queue_if_no_path"))) {
+               if (!strcasecmp(argv[0], "queue_if_no_path")) {
                        r = queue_if_no_path(m, 1, 0);
                        goto out;
-               } else if (!strnicmp(argv[0], MESG_STR("fail_if_no_path"))) {
+               } else if (!strcasecmp(argv[0], "fail_if_no_path")) {
                        r = queue_if_no_path(m, 0, 0);
                        goto out;
                }
@@ -1519,18 +1460,18 @@ static int multipath_message(struct dm_target *ti, unsigned argc, char **argv)
                goto out;
        }
 
-       if (!strnicmp(argv[0], MESG_STR("disable_group"))) {
+       if (!strcasecmp(argv[0], "disable_group")) {
                r = bypass_pg_num(m, argv[1], 1);
                goto out;
-       } else if (!strnicmp(argv[0], MESG_STR("enable_group"))) {
+       } else if (!strcasecmp(argv[0], "enable_group")) {
                r = bypass_pg_num(m, argv[1], 0);
                goto out;
-       } else if (!strnicmp(argv[0], MESG_STR("switch_group"))) {
+       } else if (!strcasecmp(argv[0], "switch_group")) {
                r = switch_pg_num(m, argv[1]);
                goto out;
-       } else if (!strnicmp(argv[0], MESG_STR("reinstate_path")))
+       } else if (!strcasecmp(argv[0], "reinstate_path"))
                action = reinstate_path;
-       else if (!strnicmp(argv[0], MESG_STR("fail_path")))
+       else if (!strcasecmp(argv[0], "fail_path"))
                action = fail_path;
        else {
                DMWARN("Unrecognised multipath message received.");
index e5d8904fc8f647162d4a7d79150491797cfda6d2..a002dd85db1e674e2efbc188a531f001c2d0716b 100644 (file)
@@ -8,19 +8,19 @@
 #include <linux/slab.h>
 
 #include "md.h"
+#include "raid1.h"
 #include "raid5.h"
-#include "dm.h"
 #include "bitmap.h"
 
+#include <linux/device-mapper.h>
+
 #define DM_MSG_PREFIX "raid"
 
 /*
- * If the MD doesn't support MD_SYNC_STATE_FORCED yet, then
- * make it so the flag doesn't set anything.
+ * The following flags are used by dm-raid.c to set up the array state.
+ * They must be cleared before md_run is called.
  */
-#ifndef MD_SYNC_STATE_FORCED
-#define MD_SYNC_STATE_FORCED 0
-#endif
+#define FirstUse 10             /* rdev flag */
 
 struct raid_dev {
        /*
@@ -43,14 +43,15 @@ struct raid_dev {
 /*
  * Flags for rs->print_flags field.
  */
-#define DMPF_DAEMON_SLEEP      0x1
-#define DMPF_MAX_WRITE_BEHIND  0x2
-#define DMPF_SYNC              0x4
-#define DMPF_NOSYNC            0x8
-#define DMPF_STRIPE_CACHE      0x10
-#define DMPF_MIN_RECOVERY_RATE 0x20
-#define DMPF_MAX_RECOVERY_RATE 0x40
-
+#define DMPF_SYNC              0x1
+#define DMPF_NOSYNC            0x2
+#define DMPF_REBUILD           0x4
+#define DMPF_DAEMON_SLEEP      0x8
+#define DMPF_MIN_RECOVERY_RATE 0x10
+#define DMPF_MAX_RECOVERY_RATE 0x20
+#define DMPF_MAX_WRITE_BEHIND  0x40
+#define DMPF_STRIPE_CACHE      0x80
+#define DMPF_REGION_SIZE       0X100
 struct raid_set {
        struct dm_target *ti;
 
@@ -72,6 +73,7 @@ static struct raid_type {
        const unsigned level;           /* RAID level. */
        const unsigned algorithm;       /* RAID algorithm. */
 } raid_types[] = {
+       {"raid1",    "RAID1 (mirroring)",               0, 2, 1, 0 /* NONE */},
        {"raid4",    "RAID4 (dedicated parity disk)",   1, 2, 5, ALGORITHM_PARITY_0},
        {"raid5_la", "RAID5 (left asymmetric)",         1, 2, 5, ALGORITHM_LEFT_ASYMMETRIC},
        {"raid5_ra", "RAID5 (right asymmetric)",        1, 2, 5, ALGORITHM_RIGHT_ASYMMETRIC},
@@ -105,7 +107,8 @@ static struct raid_set *context_alloc(struct dm_target *ti, struct raid_type *ra
        }
 
        sectors_per_dev = ti->len;
-       if (sector_div(sectors_per_dev, (raid_devs - raid_type->parity_devs))) {
+       if ((raid_type->level > 1) &&
+           sector_div(sectors_per_dev, (raid_devs - raid_type->parity_devs))) {
                ti->error = "Target length not divisible by number of data devices";
                return ERR_PTR(-EINVAL);
        }
@@ -147,9 +150,16 @@ static void context_free(struct raid_set *rs)
 {
        int i;
 
-       for (i = 0; i < rs->md.raid_disks; i++)
+       for (i = 0; i < rs->md.raid_disks; i++) {
+               if (rs->dev[i].meta_dev)
+                       dm_put_device(rs->ti, rs->dev[i].meta_dev);
+               if (rs->dev[i].rdev.sb_page)
+                       put_page(rs->dev[i].rdev.sb_page);
+               rs->dev[i].rdev.sb_page = NULL;
+               rs->dev[i].rdev.sb_loaded = 0;
                if (rs->dev[i].data_dev)
                        dm_put_device(rs->ti, rs->dev[i].data_dev);
+       }
 
        kfree(rs);
 }
@@ -159,7 +169,16 @@ static void context_free(struct raid_set *rs)
  *  <meta_dev>: meta device name or '-' if missing
  *  <data_dev>: data device name or '-' if missing
  *
- * This code parses those words.
+ * The following are permitted:
+ *    - -
+ *    - <data_dev>
+ *    <meta_dev> <data_dev>
+ *
+ * The following is not allowed:
+ *    <meta_dev> -
+ *
+ * This code parses those words.  If there is a failure,
+ * the caller must use context_free to unwind the operations.
  */
 static int dev_parms(struct raid_set *rs, char **argv)
 {
@@ -182,8 +201,16 @@ static int dev_parms(struct raid_set *rs, char **argv)
                rs->dev[i].rdev.mddev = &rs->md;
 
                if (strcmp(argv[0], "-")) {
-                       rs->ti->error = "Metadata devices not supported";
-                       return -EINVAL;
+                       ret = dm_get_device(rs->ti, argv[0],
+                                           dm_table_get_mode(rs->ti->table),
+                                           &rs->dev[i].meta_dev);
+                       rs->ti->error = "RAID metadata device lookup failure";
+                       if (ret)
+                               return ret;
+
+                       rs->dev[i].rdev.sb_page = alloc_page(GFP_KERNEL);
+                       if (!rs->dev[i].rdev.sb_page)
+                               return -ENOMEM;
                }
 
                if (!strcmp(argv[1], "-")) {
@@ -193,6 +220,10 @@ static int dev_parms(struct raid_set *rs, char **argv)
                                return -EINVAL;
                        }
 
+                       rs->ti->error = "No data device supplied with metadata device";
+                       if (rs->dev[i].meta_dev)
+                               return -EINVAL;
+
                        continue;
                }
 
@@ -204,6 +235,10 @@ static int dev_parms(struct raid_set *rs, char **argv)
                        return ret;
                }
 
+               if (rs->dev[i].meta_dev) {
+                       metadata_available = 1;
+                       rs->dev[i].rdev.meta_bdev = rs->dev[i].meta_dev->bdev;
+               }
                rs->dev[i].rdev.bdev = rs->dev[i].data_dev->bdev;
                list_add(&rs->dev[i].rdev.same_set, &rs->md.disks);
                if (!test_bit(In_sync, &rs->dev[i].rdev.flags))
@@ -234,34 +269,110 @@ static int dev_parms(struct raid_set *rs, char **argv)
        return 0;
 }
 
+/*
+ * validate_region_size
+ * @rs
+ * @region_size:  region size in sectors.  If 0, pick a size (4MiB default).
+ *
+ * Set rs->md.bitmap_info.chunksize (which really refers to 'region size').
+ * Ensure that (ti->len/region_size < 2^21) - required by MD bitmap.
+ *
+ * A user-supplied size must not exceed the target length, must not be
+ * below the minimum derived from the target length, must be a power of 2
+ * and must not be smaller than the chunk size.  An automatically chosen
+ * size is rounded up to a power of 2 so that it meets the same
+ * power-of-2 requirement.
+ *
+ * Returns: 0 on success, -EINVAL on failure.
+ */
+static int validate_region_size(struct raid_set *rs, unsigned long region_size)
+{
+       unsigned long min_region_size = rs->ti->len / (1 << 21);
+
+       if (!region_size) {
+               /*
+                * Choose a reasonable default.  All figures in sectors.
+                */
+               if (min_region_size > (1 << 13)) {
+                       /* If not a power of 2, make it the next power of 2 */
+                       region_size = roundup_pow_of_two(min_region_size);
+
+                       /* Report the value only once it has been chosen */
+                       DMINFO("Choosing default region size of %lu sectors",
+                              region_size);
+               } else {
+                       DMINFO("Choosing default region size of 4MiB");
+                       region_size = 1 << 13; /* sectors */
+               }
+       } else {
+               /*
+                * Validate user-supplied value.
+                */
+               if (region_size > rs->ti->len) {
+                       rs->ti->error = "Supplied region size is too large";
+                       return -EINVAL;
+               }
+
+               if (region_size < min_region_size) {
+                       DMERR("Supplied region_size (%lu sectors) below minimum (%lu)",
+                             region_size, min_region_size);
+                       rs->ti->error = "Supplied region size is too small";
+                       return -EINVAL;
+               }
+
+               if (!is_power_of_2(region_size)) {
+                       rs->ti->error = "Region size is not a power of 2";
+                       return -EINVAL;
+               }
+
+               if (region_size < rs->md.chunk_sectors) {
+                       rs->ti->error = "Region size is smaller than the chunk size";
+                       return -EINVAL;
+               }
+       }
+
+       /*
+        * Convert sectors to bytes.
+        */
+       rs->md.bitmap_info.chunksize = (region_size << 9);
+
+       return 0;
+}
+
 /*
  * Possible arguments are...
- * RAID456:
  *     <chunk_size> [optional_args]
  *
- * Optional args:
- *    [[no]sync]                       Force or prevent recovery of the entire array
+ * Argument definitions
+ *    <chunk_size>                     The number of sectors per disk that
+ *                                      will form the "stripe"
+ *    [[no]sync]                       Force or prevent recovery of the
+ *                                      entire array
  *    [rebuild <idx>]                  Rebuild the drive indicated by the index
- *    [daemon_sleep <ms>]              Time between bitmap daemon work to clear bits
+ *    [daemon_sleep <ms>]              Time between bitmap daemon work to
+ *                                      clear bits
  *    [min_recovery_rate <kB/sec/disk>]        Throttle RAID initialization
  *    [max_recovery_rate <kB/sec/disk>]        Throttle RAID initialization
+ *    [write_mostly <idx>]             Indicate a write mostly drive via index
  *    [max_write_behind <sectors>]     See '-write-behind=' (man mdadm)
  *    [stripe_cache <sectors>]         Stripe cache size for higher RAIDs
+ *    [region_size <sectors>]           Defines granularity of bitmap
  */
 static int parse_raid_params(struct raid_set *rs, char **argv,
                             unsigned num_raid_params)
 {
        unsigned i, rebuild_cnt = 0;
-       unsigned long value;
+       unsigned long value, region_size = 0;
        char *key;
 
        /*
         * First, parse the in-order required arguments
+        * "chunk_size" is the only argument of this type.
         */
-       if ((strict_strtoul(argv[0], 10, &value) < 0) ||
-           !is_power_of_2(value) || (value < 8)) {
+       if ((strict_strtoul(argv[0], 10, &value) < 0)) {
                rs->ti->error = "Bad chunk size";
                return -EINVAL;
+       } else if (rs->raid_type->level == 1) {
+               if (value)
+                       DMERR("Ignoring chunk size parameter for RAID 1");
+               value = 0;
+       } else if (!is_power_of_2(value)) {
+               rs->ti->error = "Chunk size must be a power of 2";
+               return -EINVAL;
+       } else if (value < 8) {
+               rs->ti->error = "Chunk size value is too small";
+               return -EINVAL;
        }
 
        rs->md.new_chunk_sectors = rs->md.chunk_sectors = value;
@@ -269,22 +380,39 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
        num_raid_params--;
 
        /*
-        * Second, parse the unordered optional arguments
+        * We set each individual device as In_sync with a completed
+        * 'recovery_offset'.  If there has been a device failure or
+        * replacement then one of the following cases applies:
+        *
+        *   1) User specifies 'rebuild'.
+        *      - Device is reset when param is read.
+        *   2) A new device is supplied.
+        *      - No matching superblock found, resets device.
+        *   3) Device failure was transient and returns on reload.
+        *      - Failure noticed, resets device for bitmap replay.
+        *   4) Device hadn't completed recovery after previous failure.
+        *      - Superblock is read and overrides recovery_offset.
+        *
+        * What is found in the superblocks of the devices is always
+        * authoritative, unless 'rebuild' or '[no]sync' was specified.
         */
-       for (i = 0; i < rs->md.raid_disks; i++)
+       for (i = 0; i < rs->md.raid_disks; i++) {
                set_bit(In_sync, &rs->dev[i].rdev.flags);
+               rs->dev[i].rdev.recovery_offset = MaxSector;
+       }
 
+       /*
+        * Second, parse the unordered optional arguments
+        */
        for (i = 0; i < num_raid_params; i++) {
-               if (!strcmp(argv[i], "nosync")) {
+               if (!strcasecmp(argv[i], "nosync")) {
                        rs->md.recovery_cp = MaxSector;
                        rs->print_flags |= DMPF_NOSYNC;
-                       rs->md.flags |= MD_SYNC_STATE_FORCED;
                        continue;
                }
-               if (!strcmp(argv[i], "sync")) {
+               if (!strcasecmp(argv[i], "sync")) {
                        rs->md.recovery_cp = 0;
                        rs->print_flags |= DMPF_SYNC;
-                       rs->md.flags |= MD_SYNC_STATE_FORCED;
                        continue;
                }
 
@@ -300,9 +428,13 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
                        return -EINVAL;
                }
 
-               if (!strcmp(key, "rebuild")) {
-                       if (++rebuild_cnt > rs->raid_type->parity_devs) {
-                               rs->ti->error = "Too many rebuild drives given";
+               if (!strcasecmp(key, "rebuild")) {
+                       rebuild_cnt++;
+                       if (((rs->raid_type->level != 1) &&
+                            (rebuild_cnt > rs->raid_type->parity_devs)) ||
+                           ((rs->raid_type->level == 1) &&
+                            (rebuild_cnt > (rs->md.raid_disks - 1)))) {
+                               rs->ti->error = "Too many rebuild devices specified for given RAID type";
                                return -EINVAL;
                        }
                        if (value > rs->md.raid_disks) {
@@ -311,7 +443,22 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
                        }
                        clear_bit(In_sync, &rs->dev[value].rdev.flags);
                        rs->dev[value].rdev.recovery_offset = 0;
-               } else if (!strcmp(key, "max_write_behind")) {
+                       rs->print_flags |= DMPF_REBUILD;
+               } else if (!strcasecmp(key, "write_mostly")) {
+                       if (rs->raid_type->level != 1) {
+                               rs->ti->error = "write_mostly option is only valid for RAID1";
+                               return -EINVAL;
+                       }
+                       if (value > rs->md.raid_disks) {
+                               rs->ti->error = "Invalid write_mostly drive index given";
+                               return -EINVAL;
+                       }
+                       set_bit(WriteMostly, &rs->dev[value].rdev.flags);
+               } else if (!strcasecmp(key, "max_write_behind")) {
+                       if (rs->raid_type->level != 1) {
+                               rs->ti->error = "max_write_behind option is only valid for RAID1";
+                               return -EINVAL;
+                       }
                        rs->print_flags |= DMPF_MAX_WRITE_BEHIND;
 
                        /*
@@ -324,14 +471,14 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
                                return -EINVAL;
                        }
                        rs->md.bitmap_info.max_write_behind = value;
-               } else if (!strcmp(key, "daemon_sleep")) {
+               } else if (!strcasecmp(key, "daemon_sleep")) {
                        rs->print_flags |= DMPF_DAEMON_SLEEP;
                        if (!value || (value > MAX_SCHEDULE_TIMEOUT)) {
                                rs->ti->error = "daemon sleep period out of range";
                                return -EINVAL;
                        }
                        rs->md.bitmap_info.daemon_sleep = value;
-               } else if (!strcmp(key, "stripe_cache")) {
+               } else if (!strcasecmp(key, "stripe_cache")) {
                        rs->print_flags |= DMPF_STRIPE_CACHE;
 
                        /*
@@ -348,20 +495,23 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
                                rs->ti->error = "Bad stripe_cache size";
                                return -EINVAL;
                        }
-               } else if (!strcmp(key, "min_recovery_rate")) {
+               } else if (!strcasecmp(key, "min_recovery_rate")) {
                        rs->print_flags |= DMPF_MIN_RECOVERY_RATE;
                        if (value > INT_MAX) {
                                rs->ti->error = "min_recovery_rate out of range";
                                return -EINVAL;
                        }
                        rs->md.sync_speed_min = (int)value;
-               } else if (!strcmp(key, "max_recovery_rate")) {
+               } else if (!strcasecmp(key, "max_recovery_rate")) {
                        rs->print_flags |= DMPF_MAX_RECOVERY_RATE;
                        if (value > INT_MAX) {
                                rs->ti->error = "max_recovery_rate out of range";
                                return -EINVAL;
                        }
                        rs->md.sync_speed_max = (int)value;
+               } else if (!strcasecmp(key, "region_size")) {
+                       rs->print_flags |= DMPF_REGION_SIZE;
+                       region_size = value;
                } else {
                        DMERR("Unable to parse RAID parameter: %s", key);
                        rs->ti->error = "Unable to parse RAID parameters";
@@ -369,6 +519,19 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
                }
        }
 
+       if (validate_region_size(rs, region_size))
+               return -EINVAL;
+
+       if (rs->md.chunk_sectors)
+               rs->ti->split_io = rs->md.chunk_sectors;
+       else
+               rs->ti->split_io = region_size;
+
+       if (rs->md.chunk_sectors)
+               rs->ti->split_io = rs->md.chunk_sectors;
+       else
+               rs->ti->split_io = region_size;
+
        /* Assume there are no metadata devices until the drives are parsed */
        rs->md.persistent = 0;
        rs->md.external = 1;
@@ -387,17 +550,351 @@ static int raid_is_congested(struct dm_target_callbacks *cb, int bits)
 {
        struct raid_set *rs = container_of(cb, struct raid_set, callbacks);
 
+       if (rs->raid_type->level == 1)
+               return md_raid1_congested(&rs->md, bits);
+
        return md_raid5_congested(&rs->md, bits);
 }
 
+/*
+ * On-disk superblock kept at the start of each RAID metadata device.
+ *
+ * This structure is never routinely used by userspace, unlike md superblocks.
+ * Devices with this superblock should only ever be accessed via device-mapper.
+ *
+ * All multi-byte fields are stored little-endian.  The named fields occupy
+ * 60 bytes; 'pad' brings the packed structure up to 512 bytes.
+ */
+#define DM_RAID_MAGIC 0x64526D44
+struct dm_raid_superblock {
+       __le32 magic;           /* "DmRd" */
+       __le32 features;        /* Used to indicate possible future changes */
+
+       __le32 num_devices;     /* Number of devices in this array. (Max 64) */
+       __le32 array_position;  /* The position of this drive in the array */
+
+       __le64 events;          /* Incremented by md when superblock updated */
+       __le64 failed_devices;  /* Bit field of devices to indicate failures */
+
+       /*
+        * This offset tracks the progress of the repair or replacement of
+        * an individual drive.
+        */
+       __le64 disk_recovery_offset;
+
+       /*
+        * This offset tracks the progress of the initial array
+        * synchronisation/parity calculation.
+        */
+       __le64 array_resync_offset;
+
+       /*
+        * RAID characteristics
+        */
+       __le32 level;
+       __le32 layout;
+       __le32 stripe_sectors;  /* Written from mddev->chunk_sectors */
+
+       __u8 pad[452];          /* Round struct to 512 bytes. */
+                               /* Always set to 0 when writing. */
+} __packed;
+
+/*
+ * read_disk_sb
+ *
+ * Read the first @size bytes of the superblock area into rdev->sb_page,
+ * unless it has already been loaded.  The page must have been allocated
+ * by the caller.
+ *
+ * Returns: 0 on success (or if already loaded), -EINVAL on read failure.
+ */
+static int read_disk_sb(mdk_rdev_t *rdev, int size)
+{
+       BUG_ON(!rdev->sb_page);
+
+       if (!rdev->sb_loaded) {
+               if (!sync_page_io(rdev, 0, size, rdev->sb_page, READ, 1)) {
+                       DMERR("Failed to read device superblock");
+                       return -EINVAL;
+               }
+
+               rdev->sb_loaded = 1;
+       }
+
+       return 0;
+}
+
+/*
+ * super_sync
+ *
+ * Regenerate the in-memory superblock held in rdev->sb_page from the
+ * current state of @mddev and @rdev.  Failures already recorded in the
+ * superblock are preserved and the bit of every device currently marked
+ * Faulty is added before the structure is rebuilt from scratch.
+ */
+static void super_sync(mddev_t *mddev, mdk_rdev_t *rdev)
+{
+       struct dm_raid_superblock *sb = page_address(rdev->sb_page);
+       uint64_t failed_mask = le64_to_cpu(sb->failed_devices);
+       mdk_rdev_t *cur, *next;
+
+       /* Accumulate the failure bits before the old contents are wiped */
+       rdev_for_each(cur, next, mddev) {
+               if (cur->raid_disk < 0)
+                       continue;
+
+               if (test_bit(Faulty, &cur->flags))
+                       failed_mask |= (1ULL << cur->raid_disk);
+       }
+
+       /* Zeroing everything also clears the padding */
+       memset(sb, 0, sizeof(*sb));
+
+       /* Identification */
+       sb->magic = cpu_to_le32(DM_RAID_MAGIC);
+       sb->features = cpu_to_le32(0);  /* No features yet */
+
+       /* RAID characteristics */
+       sb->level = cpu_to_le32(mddev->level);
+       sb->layout = cpu_to_le32(mddev->layout);
+       sb->stripe_sectors = cpu_to_le32(mddev->chunk_sectors);
+
+       /* Array membership */
+       sb->num_devices = cpu_to_le32(mddev->raid_disks);
+       sb->array_position = cpu_to_le32(rdev->raid_disk);
+
+       /* Event count and failure state */
+       sb->events = cpu_to_le64(mddev->events);
+       sb->failed_devices = cpu_to_le64(failed_mask);
+
+       /* Recovery and resync progress */
+       sb->disk_recovery_offset = cpu_to_le64(rdev->recovery_offset);
+       sb->array_resync_offset = cpu_to_le64(mddev->recovery_cp);
+}
+
+/*
+ * super_load
+ *
+ * Read the superblock of @rdev and compare it with that of @refdev, the
+ * best candidate found so far (may be NULL).  If @rdev carries no dm-raid
+ * magic, a superblock is generated in memory, the device is flagged
+ * FirstUse and MD_CHANGE_DEVS is set to force superblocks to disk.
+ *
+ * Return: 1 if use rdev, 0 if use refdev, -Exxx otherwise
+ */
+static int super_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev)
+{
+       struct dm_raid_superblock *sb, *refsb;
+       int ret;
+
+       rdev->sb_start = 0;
+       rdev->sb_size = sizeof(*sb);
+
+       ret = read_disk_sb(rdev, rdev->sb_size);
+       if (ret)
+               return ret;
+
+       sb = page_address(rdev->sb_page);
+       if (sb->magic == cpu_to_le32(DM_RAID_MAGIC)) {
+               /* Nothing to compare against: this device wins */
+               if (!refdev)
+                       return 1;
+
+               /* Otherwise the strictly higher event count wins */
+               refsb = page_address(refdev->sb_page);
+               if (le64_to_cpu(sb->events) > le64_to_cpu(refsb->events))
+                       return 1;
+
+               return 0;
+       }
+
+       /* No superblock found on this device: create one */
+       super_sync(rdev->mddev, rdev);
+       set_bit(FirstUse, &rdev->flags);
+
+       /* Force writing of superblocks to disk */
+       set_bit(MD_CHANGE_DEVS, &rdev->mddev->flags);
+
+       /* Any superblock is better than none, choose that if given */
+       if (refdev)
+               return 0;
+
+       return 1;
+}
+
+/*
+ * super_init_validation
+ *
+ * Initialise array-wide state in @mddev from the superblock held in
+ * rdev->sb_page and check every device of the array against it.
+ *
+ * Returns: 0 on success.  -EINVAL if the superblock's level, layout,
+ * chunk size or (for anything but RAID1) device count differ from the
+ * table, if new or 'rebuild' devices are combined in an unsupported way,
+ * or if a device changed position in a non-RAID1 array.
+ */
+static int super_init_validation(mddev_t *mddev, mdk_rdev_t *rdev)
+{
+       int role;
+       struct raid_set *rs = container_of(mddev, struct raid_set, md);
+       uint64_t events_sb;
+       uint64_t failed_devices;
+       struct dm_raid_superblock *sb;
+       uint32_t new_devs = 0;
+       uint32_t rebuilds = 0;
+       mdk_rdev_t *r, *t;
+       struct dm_raid_superblock *sb2;
+
+       sb = page_address(rdev->sb_page);
+       events_sb = le64_to_cpu(sb->events);
+       failed_devices = le64_to_cpu(sb->failed_devices);
+
+       /*
+        * Initialise to 1 if this is a new superblock.
+        */
+       mddev->events = events_sb ? : 1;
+
+       /*
+        * Reshaping is not currently allowed
+        */
+       if ((le32_to_cpu(sb->level) != mddev->level) ||
+           (le32_to_cpu(sb->layout) != mddev->layout) ||
+           (le32_to_cpu(sb->stripe_sectors) != mddev->chunk_sectors)) {
+               DMERR("Reshaping arrays not yet supported.");
+               return -EINVAL;
+       }
+
+       /* We can only change the number of devices in RAID1 right now */
+       if ((rs->raid_type->level != 1) &&
+           (le32_to_cpu(sb->num_devices) != mddev->raid_disks)) {
+               DMERR("Reshaping arrays not yet supported.");
+               return -EINVAL;
+       }
+
+       /* An explicit '[no]sync' table argument overrides the superblock */
+       if (!(rs->print_flags & (DMPF_SYNC | DMPF_NOSYNC)))
+               mddev->recovery_cp = le64_to_cpu(sb->array_resync_offset);
+
+       /*
+        * During load, we set FirstUse if a new superblock was written.
+        * There are two reasons we might not have a superblock:
+        * 1) The array is brand new - in which case, all of the
+        *    devices must have their In_sync bit set.  Also,
+        *    recovery_cp must be 0, unless forced.
+        * 2) This is a new device being added to an old array
+        *    and the new device needs to be rebuilt - in which
+        *    case the In_sync bit will /not/ be set and
+        *    recovery_cp must be MaxSector.
+        */
+       rdev_for_each(r, t, mddev) {
+               if (!test_bit(In_sync, &r->flags)) {
+                       if (!test_bit(FirstUse, &r->flags))
+                               DMERR("Superblock area of "
+                                     "rebuild device %d should have been "
+                                     "cleared.", r->raid_disk);
+                       set_bit(FirstUse, &r->flags);
+                       rebuilds++;
+               } else if (test_bit(FirstUse, &r->flags))
+                       new_devs++;
+       }
+
+       if (!rebuilds) {
+               if (new_devs == mddev->raid_disks) {
+                       DMINFO("Superblocks created for new array");
+                       set_bit(MD_ARRAY_FIRST_USE, &mddev->flags);
+               } else if (new_devs) {
+                       DMERR("New device injected "
+                             "into existing array without 'rebuild' "
+                             "parameter specified");
+                       return -EINVAL;
+               }
+       } else if (new_devs) {
+               DMERR("'rebuild' devices cannot be "
+                     "injected into an array with other first-time devices");
+               return -EINVAL;
+       } else if (mddev->recovery_cp != MaxSector) {
+               DMERR("'rebuild' specified while array is not in-sync");
+               return -EINVAL;
+       }
+
+       /*
+        * Now we set the Faulty bit for those devices that are
+        * recorded in the superblock as failed.
+        */
+       rdev_for_each(r, t, mddev) {
+               if (!r->sb_page)
+                       continue;
+               sb2 = page_address(r->sb_page);
+               sb2->failed_devices = 0;
+
+               /*
+                * Check for any device re-ordering.
+                */
+               if (!test_bit(FirstUse, &r->flags) && (r->raid_disk >= 0)) {
+                       role = le32_to_cpu(sb2->array_position);
+                       if (role != r->raid_disk) {
+                               if (rs->raid_type->level != 1) {
+                                       rs->ti->error = "Cannot change device "
+                                               "positions in RAID array";
+                                       return -EINVAL;
+                               }
+                               DMINFO("RAID1 device #%d now at position #%d",
+                                      role, r->raid_disk);
+                       }
+
+                       /*
+                        * Partial recovery is performed on
+                        * returning failed devices.
+                        *
+                        * failed_devices is a 64-bit mask, so the bit
+                        * must be built with a 64-bit shift (as
+                        * super_sync does when it records failures).
+                        */
+                       if (failed_devices & (1ULL << role))
+                               set_bit(Faulty, &r->flags);
+               }
+       }
+
+       return 0;
+}
+
+/*
+ * super_validate
+ *
+ * Apply the superblock of @rdev to the device's runtime state.  The first
+ * call (while mddev->events is still 0) also performs the array-wide
+ * checks in super_init_validation.
+ *
+ * Devices without a superblock page (no metadata device was supplied for
+ * them) have nothing to validate and are accepted as they are.
+ *
+ * Returns: 0 on success, -EINVAL if the array-wide validation fails.
+ */
+static int super_validate(mddev_t *mddev, mdk_rdev_t *rdev)
+{
+       struct dm_raid_superblock *sb;
+
+       /*
+        * The caller iterates over every device in the array, including
+        * those that were given no metadata device and so have no sb_page.
+        */
+       if (!rdev->sb_page)
+               return 0;
+
+       sb = page_address(rdev->sb_page);
+
+       /*
+        * If mddev->events is not set, we know we have not yet initialized
+        * the array.
+        */
+       if (!mddev->events && super_init_validation(mddev, rdev))
+               return -EINVAL;
+
+       mddev->bitmap_info.offset = 4096 >> 9; /* Enable bitmap creation */
+       rdev->mddev->bitmap_info.default_offset = 4096 >> 9;
+       if (!test_bit(FirstUse, &rdev->flags)) {
+               /* Existing superblock: resume any unfinished recovery */
+               rdev->recovery_offset = le64_to_cpu(sb->disk_recovery_offset);
+               if (rdev->recovery_offset != MaxSector)
+                       clear_bit(In_sync, &rdev->flags);
+       }
+
+       /*
+        * If a device comes back, set it as not In_sync and no longer faulty.
+        */
+       if (test_bit(Faulty, &rdev->flags)) {
+               clear_bit(Faulty, &rdev->flags);
+               clear_bit(In_sync, &rdev->flags);
+               rdev->saved_raid_disk = rdev->raid_disk;
+               rdev->recovery_offset = 0;
+       }
+
+       clear_bit(FirstUse, &rdev->flags);
+
+       return 0;
+}
+
+/*
+ * Analyse superblocks and select the freshest.
+ *
+ * Superblocks are loaded from every device that has a metadata device.
+ * If none has one, there is nothing to do.  Otherwise the freshest
+ * device is validated first, followed by every other device in the array.
+ *
+ * Returns: 0 on success, the error from super_load if a superblock could
+ * not be loaded, or -EINVAL if validation fails.
+ */
+static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
+{
+       mddev_t *mddev = &rs->md;
+       mdk_rdev_t *freshest = NULL;
+       mdk_rdev_t *rdev, *tmp;
+       int ret;
+
+       rdev_for_each(rdev, tmp, mddev) {
+               if (!rdev->meta_bdev)
+                       continue;
+
+               ret = super_load(rdev, freshest);
+               if (ret == 1) {
+                       /* This device beats the current candidate */
+                       freshest = rdev;
+               } else if (ret) {
+                       ti->error = "Failed to load superblock";
+                       return ret;
+               }
+       }
+
+       if (!freshest)
+               return 0;
+
+       /*
+        * Validation of the freshest device provides the source of
+        * validation for the remaining devices.
+        */
+       ti->error = "Unable to assemble array: Invalid superblocks";
+       if (super_validate(mddev, freshest))
+               return -EINVAL;
+
+       rdev_for_each(rdev, tmp, mddev) {
+               if (rdev == freshest)
+                       continue;
+
+               if (super_validate(mddev, rdev))
+                       return -EINVAL;
+       }
+
+       return 0;
+}
+
 /*
  * Construct a RAID4/5/6 mapping:
  * Args:
  *     <raid_type> <#raid_params> <raid_params>                \
  *     <#raid_devs> { <meta_dev1> <dev1> .. <meta_devN> <devN> }
  *
- * ** metadata devices are not supported yet, use '-' instead **
- *
  * <raid_params> varies by <raid_type>.  See 'parse_raid_params' for
  * details on possible <raid_params>.
  */
@@ -465,8 +962,12 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
        if (ret)
                goto bad;
 
+       rs->md.sync_super = super_sync;
+       ret = analyse_superblocks(ti, rs);
+       if (ret)
+               goto bad;
+
        INIT_WORK(&rs->md.event_work, do_table_event);
-       ti->split_io = rs->md.chunk_sectors;
        ti->private = rs;
 
        mutex_lock(&rs->md.reconfig_mutex);
@@ -482,6 +983,7 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
        rs->callbacks.congested_fn = raid_is_congested;
        dm_table_add_target_callbacks(ti->table, &rs->callbacks);
 
+       mddev_suspend(&rs->md);
        return 0;
 
 bad:
@@ -546,12 +1048,17 @@ static int raid_status(struct dm_target *ti, status_type_t type,
                break;
        case STATUSTYPE_TABLE:
                /* The string you would use to construct this array */
-               for (i = 0; i < rs->md.raid_disks; i++)
-                       if (rs->dev[i].data_dev &&
+               for (i = 0; i < rs->md.raid_disks; i++) {
+                       if ((rs->print_flags & DMPF_REBUILD) &&
+                           rs->dev[i].data_dev &&
                            !test_bit(In_sync, &rs->dev[i].rdev.flags))
-                               raid_param_cnt++; /* for rebuilds */
+                               raid_param_cnt += 2; /* for rebuilds */
+                       if (rs->dev[i].data_dev &&
+                           test_bit(WriteMostly, &rs->dev[i].rdev.flags))
+                               raid_param_cnt += 2;
+               }
 
-               raid_param_cnt += (hweight64(rs->print_flags) * 2);
+               raid_param_cnt += (hweight64(rs->print_flags & ~DMPF_REBUILD) * 2);
                if (rs->print_flags & (DMPF_SYNC | DMPF_NOSYNC))
                        raid_param_cnt--;
 
@@ -565,7 +1072,8 @@ static int raid_status(struct dm_target *ti, status_type_t type,
                        DMEMIT(" nosync");
 
                for (i = 0; i < rs->md.raid_disks; i++)
-                       if (rs->dev[i].data_dev &&
+                       if ((rs->print_flags & DMPF_REBUILD) &&
+                           rs->dev[i].data_dev &&
                            !test_bit(In_sync, &rs->dev[i].rdev.flags))
                                DMEMIT(" rebuild %u", i);
 
@@ -579,6 +1087,11 @@ static int raid_status(struct dm_target *ti, status_type_t type,
                if (rs->print_flags & DMPF_MAX_RECOVERY_RATE)
                        DMEMIT(" max_recovery_rate %d", rs->md.sync_speed_max);
 
+               for (i = 0; i < rs->md.raid_disks; i++)
+                       if (rs->dev[i].data_dev &&
+                           test_bit(WriteMostly, &rs->dev[i].rdev.flags))
+                               DMEMIT(" write_mostly %u", i);
+
                if (rs->print_flags & DMPF_MAX_WRITE_BEHIND)
                        DMEMIT(" max_write_behind %lu",
                               rs->md.bitmap_info.max_write_behind);
@@ -591,9 +1104,16 @@ static int raid_status(struct dm_target *ti, status_type_t type,
                               conf ? conf->max_nr_stripes * 2 : 0);
                }
 
+               if (rs->print_flags & DMPF_REGION_SIZE)
+                       DMEMIT(" region_size %lu",
+                              rs->md.bitmap_info.chunksize >> 9);
+
                DMEMIT(" %d", rs->md.raid_disks);
                for (i = 0; i < rs->md.raid_disks; i++) {
-                       DMEMIT(" -"); /* metadata device */
+                       if (rs->dev[i].meta_dev)
+                               DMEMIT(" %s", rs->dev[i].meta_dev->name);
+                       else
+                               DMEMIT(" -");
 
                        if (rs->dev[i].data_dev)
                                DMEMIT(" %s", rs->dev[i].data_dev->name);
@@ -650,12 +1170,13 @@ static void raid_resume(struct dm_target *ti)
 {
        struct raid_set *rs = ti->private;
 
+       bitmap_load(&rs->md);
        mddev_resume(&rs->md);
 }
 
 static struct target_type raid_target = {
        .name = "raid",
-       .version = {1, 0, 0},
+       .version = {1, 1, 0},
        .module = THIS_MODULE,
        .ctr = raid_ctr,
        .dtr = raid_dtr,
index 135c2f1fdbfcc95aefb0d842030e71a089e87d30..d1f1d70171038cf9836832b68c63ad4b1d15297a 100644 (file)
 #define NUM_SNAPSHOT_HDR_CHUNKS 1
 
 struct disk_header {
-       uint32_t magic;
+       __le32 magic;
 
        /*
         * Is this snapshot valid.  There is no way of recovering
         * an invalid snapshot.
         */
-       uint32_t valid;
+       __le32 valid;
 
        /*
         * Simple, incrementing version. no backward
         * compatibility.
         */
-       uint32_t version;
+       __le32 version;
 
        /* In sectors */
-       uint32_t chunk_size;
-};
+       __le32 chunk_size;
+} __packed;
 
 struct disk_exception {
+       __le64 old_chunk;
+       __le64 new_chunk;
+} __packed;
+
+struct core_exception {
        uint64_t old_chunk;
        uint64_t new_chunk;
 };
@@ -169,10 +174,9 @@ static int alloc_area(struct pstore *ps)
        if (!ps->area)
                goto err_area;
 
-       ps->zero_area = vmalloc(len);
+       ps->zero_area = vzalloc(len);
        if (!ps->zero_area)
                goto err_zero_area;
-       memset(ps->zero_area, 0, len);
 
        ps->header_area = vmalloc(len);
        if (!ps->header_area)
@@ -396,32 +400,32 @@ static struct disk_exception *get_exception(struct pstore *ps, uint32_t index)
 }
 
 static void read_exception(struct pstore *ps,
-                          uint32_t index, struct disk_exception *result)
+                          uint32_t index, struct core_exception *result)
 {
-       struct disk_exception *e = get_exception(ps, index);
+       struct disk_exception *de = get_exception(ps, index);
 
        /* copy it */
-       result->old_chunk = le64_to_cpu(e->old_chunk);
-       result->new_chunk = le64_to_cpu(e->new_chunk);
+       result->old_chunk = le64_to_cpu(de->old_chunk);
+       result->new_chunk = le64_to_cpu(de->new_chunk);
 }
 
 static void write_exception(struct pstore *ps,
-                           uint32_t index, struct disk_exception *de)
+                           uint32_t index, struct core_exception *e)
 {
-       struct disk_exception *e = get_exception(ps, index);
+       struct disk_exception *de = get_exception(ps, index);
 
        /* copy it */
-       e->old_chunk = cpu_to_le64(de->old_chunk);
-       e->new_chunk = cpu_to_le64(de->new_chunk);
+       de->old_chunk = cpu_to_le64(e->old_chunk);
+       de->new_chunk = cpu_to_le64(e->new_chunk);
 }
 
 static void clear_exception(struct pstore *ps, uint32_t index)
 {
-       struct disk_exception *e = get_exception(ps, index);
+       struct disk_exception *de = get_exception(ps, index);
 
        /* clear it */
-       e->old_chunk = 0;
-       e->new_chunk = 0;
+       de->old_chunk = 0;
+       de->new_chunk = 0;
 }
 
 /*
@@ -437,13 +441,13 @@ static int insert_exceptions(struct pstore *ps,
 {
        int r;
        unsigned int i;
-       struct disk_exception de;
+       struct core_exception e;
 
        /* presume the area is full */
        *full = 1;
 
        for (i = 0; i < ps->exceptions_per_area; i++) {
-               read_exception(ps, i, &de);
+               read_exception(ps, i, &e);
 
                /*
                 * If the new_chunk is pointing at the start of
@@ -451,7 +455,7 @@ static int insert_exceptions(struct pstore *ps,
                 * is we know that we've hit the end of the
                 * exceptions.  Therefore the area is not full.
                 */
-               if (de.new_chunk == 0LL) {
+               if (e.new_chunk == 0LL) {
                        ps->current_committed = i;
                        *full = 0;
                        break;
@@ -460,13 +464,13 @@ static int insert_exceptions(struct pstore *ps,
                /*
                 * Keep track of the start of the free chunks.
                 */
-               if (ps->next_free <= de.new_chunk)
-                       ps->next_free = de.new_chunk + 1;
+               if (ps->next_free <= e.new_chunk)
+                       ps->next_free = e.new_chunk + 1;
 
                /*
                 * Otherwise we add the exception to the snapshot.
                 */
-               r = callback(callback_context, de.old_chunk, de.new_chunk);
+               r = callback(callback_context, e.old_chunk, e.new_chunk);
                if (r)
                        return r;
        }
@@ -563,7 +567,7 @@ static int persistent_read_metadata(struct dm_exception_store *store,
        ps->exceptions_per_area = (ps->store->chunk_size << SECTOR_SHIFT) /
                                  sizeof(struct disk_exception);
        ps->callbacks = dm_vcalloc(ps->exceptions_per_area,
-                       sizeof(*ps->callbacks));
+                                  sizeof(*ps->callbacks));
        if (!ps->callbacks)
                return -ENOMEM;
 
@@ -641,12 +645,12 @@ static void persistent_commit_exception(struct dm_exception_store *store,
 {
        unsigned int i;
        struct pstore *ps = get_info(store);
-       struct disk_exception de;
+       struct core_exception ce;
        struct commit_callback *cb;
 
-       de.old_chunk = e->old_chunk;
-       de.new_chunk = e->new_chunk;
-       write_exception(ps, ps->current_committed++, &de);
+       ce.old_chunk = e->old_chunk;
+       ce.new_chunk = e->new_chunk;
+       write_exception(ps, ps->current_committed++, &ce);
 
        /*
         * Add the callback to the back of the array.  This code
@@ -670,7 +674,7 @@ static void persistent_commit_exception(struct dm_exception_store *store,
         * If we completely filled the current area, then wipe the next one.
         */
        if ((ps->current_committed == ps->exceptions_per_area) &&
-            zero_disk_area(ps, ps->current_area + 1))
+           zero_disk_area(ps, ps->current_area + 1))
                ps->valid = 0;
 
        /*
@@ -701,7 +705,7 @@ static int persistent_prepare_merge(struct dm_exception_store *store,
                                    chunk_t *last_new_chunk)
 {
        struct pstore *ps = get_info(store);
-       struct disk_exception de;
+       struct core_exception ce;
        int nr_consecutive;
        int r;
 
@@ -722,9 +726,9 @@ static int persistent_prepare_merge(struct dm_exception_store *store,
                ps->current_committed = ps->exceptions_per_area;
        }
 
-       read_exception(ps, ps->current_committed - 1, &de);
-       *last_old_chunk = de.old_chunk;
-       *last_new_chunk = de.new_chunk;
+       read_exception(ps, ps->current_committed - 1, &ce);
+       *last_old_chunk = ce.old_chunk;
+       *last_new_chunk = ce.new_chunk;
 
        /*
         * Find number of consecutive chunks within the current area,
@@ -733,9 +737,9 @@ static int persistent_prepare_merge(struct dm_exception_store *store,
        for (nr_consecutive = 1; nr_consecutive < ps->current_committed;
             nr_consecutive++) {
                read_exception(ps, ps->current_committed - 1 - nr_consecutive,
-                              &de);
-               if (de.old_chunk != *last_old_chunk - nr_consecutive ||
-                   de.new_chunk != *last_new_chunk - nr_consecutive)
+                              &ce);
+               if (ce.old_chunk != *last_old_chunk - nr_consecutive ||
+                   ce.new_chunk != *last_new_chunk - nr_consecutive)
                        break;
        }
 
@@ -753,7 +757,7 @@ static int persistent_commit_merge(struct dm_exception_store *store,
        for (i = 0; i < nr_merged; i++)
                clear_exception(ps, ps->current_committed - 1 - i);
 
-       r = area_io(ps, WRITE);
+       r = area_io(ps, WRITE_FLUSH_FUA);
        if (r < 0)
                return r;
 
index 9ecff5f3023a4c4f0721b958ef1541e762e910ff..6f758870fc19cf0e66db3055dab36ab96e038fee 100644 (file)
@@ -29,16 +29,6 @@ static const char dm_snapshot_merge_target_name[] = "snapshot-merge";
 #define dm_target_is_snapshot_merge(ti) \
        ((ti)->type->name == dm_snapshot_merge_target_name)
 
-/*
- * The percentage increment we will wake up users at
- */
-#define WAKE_UP_PERCENT 5
-
-/*
- * kcopyd priority of snapshot operations
- */
-#define SNAPSHOT_COPY_PRIORITY 2
-
 /*
  * The size of the mempool used to track chunks in use.
  */
@@ -180,6 +170,13 @@ struct dm_snap_pending_exception {
         * kcopyd.
         */
        int started;
+
+       /*
+        * For writing a complete chunk, bypassing the copy.
+        */
+       struct bio *full_bio;
+       bio_end_io_t *full_bio_end_io;
+       void *full_bio_private;
 };
 
 /*
@@ -1055,8 +1052,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 
        s = kmalloc(sizeof(*s), GFP_KERNEL);
        if (!s) {
-               ti->error = "Cannot allocate snapshot context private "
-                   "structure";
+               ti->error = "Cannot allocate private snapshot structure";
                r = -ENOMEM;
                goto bad;
        }
@@ -1380,6 +1376,7 @@ static void pending_complete(struct dm_snap_pending_exception *pe, int success)
        struct dm_snapshot *s = pe->snap;
        struct bio *origin_bios = NULL;
        struct bio *snapshot_bios = NULL;
+       struct bio *full_bio = NULL;
        int error = 0;
 
        if (!success) {
@@ -1415,10 +1412,15 @@ static void pending_complete(struct dm_snap_pending_exception *pe, int success)
         */
        dm_insert_exception(&s->complete, e);
 
- out:
+out:
        dm_remove_exception(&pe->e);
        snapshot_bios = bio_list_get(&pe->snapshot_bios);
        origin_bios = bio_list_get(&pe->origin_bios);
+       full_bio = pe->full_bio;
+       if (full_bio) {
+               full_bio->bi_end_io = pe->full_bio_end_io;
+               full_bio->bi_private = pe->full_bio_private;
+       }
        free_pending_exception(pe);
 
        increment_pending_exceptions_done_count();
@@ -1426,10 +1428,15 @@ static void pending_complete(struct dm_snap_pending_exception *pe, int success)
        up_write(&s->lock);
 
        /* Submit any pending write bios */
-       if (error)
+       if (error) {
+               if (full_bio)
+                       bio_io_error(full_bio);
                error_bios(snapshot_bios);
-       else
+       } else {
+               if (full_bio)
+                       bio_endio(full_bio, 0);
                flush_bios(snapshot_bios);
+       }
 
        retry_origin_bios(s, origin_bios);
 }
@@ -1480,8 +1487,33 @@ static void start_copy(struct dm_snap_pending_exception *pe)
        dest.count = src.count;
 
        /* Hand over to kcopyd */
-       dm_kcopyd_copy(s->kcopyd_client,
-                   &src, 1, &dest, 0, copy_callback, pe);
+       dm_kcopyd_copy(s->kcopyd_client, &src, 1, &dest, 0, copy_callback, pe);
+}
+
+static void full_bio_end_io(struct bio *bio, int error)
+{
+       unsigned long write_err = error ? 1 : 0;  /* flag any failure as a write error */
+
+       dm_kcopyd_do_callback(bio->bi_private, 0, write_err);
+}
+
+static void start_full_bio(struct dm_snap_pending_exception *pe,
+                          struct bio *bio)
+{
+       /*
+        * Stash the bio and its original completion hooks in the pending
+        * exception so that pending_complete() can restore and finish it.
+        */
+       pe->full_bio = bio;
+       pe->full_bio_end_io = bio->bi_end_io;
+       pe->full_bio_private = bio->bi_private;
+
+       /* Route the bio's completion through a prepared kcopyd callback. */
+       bio->bi_private = dm_kcopyd_prepare_callback(pe->snap->kcopyd_client,
+                                                    copy_callback, pe);
+       bio->bi_end_io = full_bio_end_io;
+
+       generic_make_request(bio);
 }
 
 static struct dm_snap_pending_exception *
@@ -1519,6 +1551,7 @@ __find_pending_exception(struct dm_snapshot *s,
        bio_list_init(&pe->origin_bios);
        bio_list_init(&pe->snapshot_bios);
        pe->started = 0;
+       pe->full_bio = NULL;
 
        if (s->store->type->prepare_exception(s->store, &pe->e)) {
                free_pending_exception(pe);
@@ -1612,10 +1645,19 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio,
                }
 
                remap_exception(s, &pe->e, bio, chunk);
-               bio_list_add(&pe->snapshot_bios, bio);
 
                r = DM_MAPIO_SUBMITTED;
 
+               if (!pe->started &&
+                   bio->bi_size == (s->store->chunk_size << SECTOR_SHIFT)) {
+                       pe->started = 1;
+                       up_write(&s->lock);
+                       start_full_bio(pe, bio);
+                       goto out;
+               }
+
+               bio_list_add(&pe->snapshot_bios, bio);
+
                if (!pe->started) {
                        /* this is protected by snap->lock */
                        pe->started = 1;
@@ -1628,9 +1670,9 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio,
                map_context->ptr = track_chunk(s, chunk);
        }
 
- out_unlock:
+out_unlock:
        up_write(&s->lock);
- out:
+out:
        return r;
 }
 
@@ -1974,7 +2016,7 @@ static int __origin_write(struct list_head *snapshots, sector_t sector,
                        pe_to_start_now = pe;
                }
 
- next_snapshot:
+next_snapshot:
                up_write(&snap->lock);
 
                if (pe_to_start_now) {
index bfe9c2333ceacecddd00d058615fc8a9f2867b1b..986b8754bb0813c59fb22a0feae7083740378f72 100644 (file)
@@ -54,7 +54,6 @@ struct dm_table {
        sector_t *highs;
        struct dm_target *targets;
 
-       unsigned discards_supported:1;
        unsigned integrity_supported:1;
 
        /*
@@ -154,12 +153,11 @@ void *dm_vcalloc(unsigned long nmemb, unsigned long elem_size)
                return NULL;
 
        size = nmemb * elem_size;
-       addr = vmalloc(size);
-       if (addr)
-               memset(addr, 0, size);
+       addr = vzalloc(size);
 
        return addr;
 }
+EXPORT_SYMBOL(dm_vcalloc);
 
 /*
  * highs, and targets are managed as dynamic arrays during a
@@ -209,7 +207,6 @@ int dm_table_create(struct dm_table **result, fmode_t mode,
        INIT_LIST_HEAD(&t->devices);
        INIT_LIST_HEAD(&t->target_callbacks);
        atomic_set(&t->holders, 0);
-       t->discards_supported = 1;
 
        if (!num_targets)
                num_targets = KEYS_PER_NODE;
@@ -281,6 +278,7 @@ void dm_table_get(struct dm_table *t)
 {
        atomic_inc(&t->holders);
 }
+EXPORT_SYMBOL(dm_table_get);
 
 void dm_table_put(struct dm_table *t)
 {
@@ -290,6 +288,7 @@ void dm_table_put(struct dm_table *t)
        smp_mb__before_atomic_dec();
        atomic_dec(&t->holders);
 }
+EXPORT_SYMBOL(dm_table_put);
 
 /*
  * Checks to see if we need to extend highs or targets.
@@ -455,13 +454,14 @@ static int upgrade_mode(struct dm_dev_internal *dd, fmode_t new_mode,
  * Add a device to the list, or just increment the usage count if
  * it's already present.
  */
-static int __table_get_device(struct dm_table *t, struct dm_target *ti,
-                     const char *path, fmode_t mode, struct dm_dev **result)
+int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode,
+                 struct dm_dev **result)
 {
        int r;
        dev_t uninitialized_var(dev);
        struct dm_dev_internal *dd;
        unsigned int major, minor;
+       struct dm_table *t = ti->table;
 
        BUG_ON(!t);
 
@@ -509,6 +509,7 @@ static int __table_get_device(struct dm_table *t, struct dm_target *ti,
        *result = &dd->dm_dev;
        return 0;
 }
+EXPORT_SYMBOL(dm_get_device);
 
 int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev,
                         sector_t start, sector_t len, void *data)
@@ -539,23 +540,15 @@ int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev,
         * If not we'll force DM to use PAGE_SIZE or
         * smaller I/O, just to be safe.
         */
-
-       if (q->merge_bvec_fn && !ti->type->merge)
+       if (dm_queue_merge_is_compulsory(q) && !ti->type->merge)
                blk_limits_max_hw_sectors(limits,
                                          (unsigned int) (PAGE_SIZE >> 9));
        return 0;
 }
 EXPORT_SYMBOL_GPL(dm_set_device_limits);
 
-int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode,
-                 struct dm_dev **result)
-{
-       return __table_get_device(ti->table, ti, path, mode, result);
-}
-
-
 /*
- * Decrement a devices use count and remove it if necessary.
+ * Decrement a device's use count and remove it if necessary.
  */
 void dm_put_device(struct dm_target *ti, struct dm_dev *d)
 {
@@ -568,6 +561,7 @@ void dm_put_device(struct dm_target *ti, struct dm_dev *d)
                kfree(dd);
        }
 }
+EXPORT_SYMBOL(dm_put_device);
 
 /*
  * Checks to see if the target joins onto the end of the table.
@@ -791,8 +785,9 @@ int dm_table_add_target(struct dm_table *t, const char *type,
 
        t->highs[t->num_targets++] = tgt->begin + tgt->len - 1;
 
-       if (!tgt->num_discard_requests)
-               t->discards_supported = 0;
+       if (!tgt->num_discard_requests && tgt->discards_supported)
+               DMWARN("%s: %s: ignoring discards_supported because num_discard_requests is zero.",
+                      dm_device_name(t->md), type);
 
        return 0;
 
@@ -802,6 +797,63 @@ int dm_table_add_target(struct dm_table *t, const char *type,
        return r;
 }
 
+/*
+ * Target argument parsing helpers.  validate_next_arg() shifts one argument
+ * and accepts it only as a bare unsigned number within [arg->min, arg->max].
+ */
+static int validate_next_arg(struct dm_arg *arg, struct dm_arg_set *arg_set,
+                            unsigned *value, char **error, unsigned grouped)
+{
+       const char *arg_str = dm_shift_arg(arg_set);
+       char dummy;     /* only written if text trails the number: reject then */
+
+       if (!arg_str || (sscanf(arg_str, "%u%c", value, &dummy) != 1) ||
+           (*value < arg->min) || (*value > arg->max) ||
+           (grouped && arg_set->argc < *value)) {  /* group exceeds what is left */
+               *error = arg->error;
+               return -EINVAL;
+       }
+
+       return 0;
+}
+
+int dm_read_arg(struct dm_arg *arg, struct dm_arg_set *arg_set,
+               unsigned *value, char **error)
+{
+       return validate_next_arg(arg, arg_set, value, error, 0);  /* grouped = 0 */
+}
+EXPORT_SYMBOL(dm_read_arg);
+
+int dm_read_arg_group(struct dm_arg *arg, struct dm_arg_set *arg_set,
+                     unsigned *value, char **error)
+{
+       return validate_next_arg(arg, arg_set, value, error, 1);  /* grouped = 1 */
+}
+EXPORT_SYMBOL(dm_read_arg_group);
+
+const char *dm_shift_arg(struct dm_arg_set *as)
+{
+       const char *head;
+
+       /* Nothing left to hand out. */
+       if (!as->argc)
+               return NULL;
+
+       /* Pop the front argument and step past it. */
+       head = *as->argv++;
+       as->argc--;
+       return head;
+}
+EXPORT_SYMBOL(dm_shift_arg);
+
+void dm_consume_args(struct dm_arg_set *as, unsigned num_args)
+{
+       BUG_ON(num_args > as->argc);    /* cannot skip more than remain */
+       as->argv += num_args;
+       as->argc -= num_args;
+}
+EXPORT_SYMBOL(dm_consume_args);
+
 static int dm_table_set_type(struct dm_table *t)
 {
        unsigned i;
@@ -1077,11 +1129,13 @@ void dm_table_event(struct dm_table *t)
                t->event_fn(t->event_context);
        mutex_unlock(&_event_lock);
 }
+EXPORT_SYMBOL(dm_table_event);
 
 sector_t dm_table_get_size(struct dm_table *t)
 {
        return t->num_targets ? (t->highs[t->num_targets - 1] + 1) : 0;
 }
+EXPORT_SYMBOL(dm_table_get_size);
 
 struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index)
 {
@@ -1194,9 +1248,45 @@ static void dm_table_set_integrity(struct dm_table *t)
                               blk_get_integrity(template_disk));
 }
 
+static int device_flush_capable(struct dm_target *ti, struct dm_dev *dev,
+                               sector_t start, sector_t len, void *data)
+{
+       struct request_queue *q = bdev_get_queue(dev->bdev);
+       unsigned wanted = *(unsigned *)data;    /* flush flag bits to look for */
+
+       return q ? (q->flush_flags & wanted) != 0 : 0;
+}
+
+static bool dm_table_supports_flush(struct dm_table *t, unsigned flush)
+{
+       struct dm_target *tgt;
+       unsigned idx;
+
+       /*
+        * One flush-capable underlying device is enough.  The walk goes
+        * through iterate_devices, not t->devices, because t->devices
+        * also holds internal dm devices such as mirror logs; targets
+        * supporting flushes must therefore provide iterate_devices.
+        */
+
+       for (idx = 0; idx < dm_table_get_num_targets(t); idx++) {
+               tgt = dm_table_get_target(t, idx);
+
+               /* Targets taking no flush requests are not consulted. */
+               if (tgt->num_flush_requests && tgt->type->iterate_devices &&
+                   tgt->type->iterate_devices(tgt, device_flush_capable,
+                                              &flush))
+                       return true;
+       }
+
+       return false;
+}
+
 void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
                               struct queue_limits *limits)
 {
+       unsigned flush = 0;
+
        /*
         * Copy table's limits to the DM device's request_queue
         */
@@ -1207,6 +1297,13 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
        else
                queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
 
+       if (dm_table_supports_flush(t, REQ_FLUSH)) {
+               flush |= REQ_FLUSH;
+               if (dm_table_supports_flush(t, REQ_FUA))
+                       flush |= REQ_FUA;
+       }
+       blk_queue_flush(q, flush);
+
        dm_table_set_integrity(t);
 
        /*
@@ -1237,6 +1334,7 @@ fmode_t dm_table_get_mode(struct dm_table *t)
 {
        return t->mode;
 }
+EXPORT_SYMBOL(dm_table_get_mode);
 
 static void suspend_targets(struct dm_table *t, unsigned postsuspend)
 {
@@ -1345,6 +1443,7 @@ struct mapped_device *dm_table_get_md(struct dm_table *t)
 {
        return t->md;
 }
+EXPORT_SYMBOL(dm_table_get_md);
 
 static int device_discard_capable(struct dm_target *ti, struct dm_dev *dev,
                                  sector_t start, sector_t len, void *data)
@@ -1359,19 +1458,19 @@ bool dm_table_supports_discards(struct dm_table *t)
        struct dm_target *ti;
        unsigned i = 0;
 
-       if (!t->discards_supported)
-               return 0;
-
        /*
         * Unless any target used by the table set discards_supported,
         * require at least one underlying device to support discards.
         * t->devices includes internal dm devices such as mirror logs
         * so we need to use iterate_devices here, which targets
-        * supporting discard must provide.
+        * supporting discard selectively must provide.
         */
        while (i < dm_table_get_num_targets(t)) {
                ti = dm_table_get_target(t, i++);
 
+               if (!ti->num_discard_requests)
+                       continue;
+
                if (ti->discards_supported)
                        return 1;
 
@@ -1382,13 +1481,3 @@ bool dm_table_supports_discards(struct dm_table *t)
 
        return 0;
 }
-
-EXPORT_SYMBOL(dm_vcalloc);
-EXPORT_SYMBOL(dm_get_device);
-EXPORT_SYMBOL(dm_put_device);
-EXPORT_SYMBOL(dm_table_event);
-EXPORT_SYMBOL(dm_table_get_size);
-EXPORT_SYMBOL(dm_table_get_mode);
-EXPORT_SYMBOL(dm_table_get_md);
-EXPORT_SYMBOL(dm_table_put);
-EXPORT_SYMBOL(dm_table_get);
index 0cf68b478878f327598fb756fe2da173d5443468..52b39f335bb38549045f46eae8cad3714c867c5d 100644 (file)
@@ -37,6 +37,8 @@ static const char *_name = DM_NAME;
 static unsigned int major = 0;
 static unsigned int _major = 0;
 
+static DEFINE_IDR(_minor_idr);
+
 static DEFINE_SPINLOCK(_minor_lock);
 /*
  * For bio-based dm.
@@ -109,6 +111,7 @@ EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo);
 #define DMF_FREEING 3
 #define DMF_DELETING 4
 #define DMF_NOFLUSH_SUSPENDING 5
+#define DMF_MERGE_IS_OPTIONAL 6
 
 /*
  * Work processed by per-device workqueue.
@@ -313,6 +316,12 @@ static void __exit dm_exit(void)
 
        while (i--)
                _exits[i]();
+
+       /*
+        * Should be empty by this point.
+        */
+       idr_remove_all(&_minor_idr);
+       idr_destroy(&_minor_idr);
 }
 
 /*
@@ -1171,7 +1180,8 @@ static int __clone_and_map_discard(struct clone_info *ci)
 
                /*
                 * Even though the device advertised discard support,
-                * reconfiguration might have changed that since the
+                * that does not mean every target supports it, and
+                * reconfiguration might also have changed that since the
                 * check was performed.
                 */
                if (!ti->num_discard_requests)
@@ -1705,8 +1715,6 @@ static int dm_any_congested(void *congested_data, int bdi_bits)
 /*-----------------------------------------------------------------
  * An IDR is used to keep track of allocated minor numbers.
  *---------------------------------------------------------------*/
-static DEFINE_IDR(_minor_idr);
-
 static void free_minor(int minor)
 {
        spin_lock(&_minor_lock);
@@ -1800,7 +1808,6 @@ static void dm_init_md_queue(struct mapped_device *md)
        blk_queue_make_request(md->queue, dm_request);
        blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
        blk_queue_merge_bvec(md->queue, dm_merge_bvec);
-       blk_queue_flush(md->queue, REQ_FLUSH | REQ_FUA);
 }
 
 /*
@@ -1985,6 +1992,59 @@ static void __set_size(struct mapped_device *md, sector_t size)
        i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
 }
 
+/*
+ * Tell whether the merge_bvec_fn of a queue must be honoured (returns 1).
+ *
+ * A return of 0 means either that the queue has no merge_bvec_fn at
+ * all, or that it belongs to a dm device flagged DMF_MERGE_IS_OPTIONAL,
+ * i.e. one able to split any bios it receives that are too big.
+ */
+int dm_queue_merge_is_compulsory(struct request_queue *q)
+{
+       struct mapped_device *md;
+
+       if (!q->merge_bvec_fn)
+               return 0;
+
+       /* Not a dm_request queue: there is no optional-merge flag to consult. */
+       if (q->make_request_fn != dm_request)
+               return 1;
+
+       md = q->queuedata;
+
+       return !test_bit(DMF_MERGE_IS_OPTIONAL, &md->flags);
+}
+
+static int dm_device_merge_is_compulsory(struct dm_target *ti,
+                                        struct dm_dev *dev, sector_t start,
+                                        sector_t len, void *data)
+{
+       /*
+        * iterate_devices callout; ti, start, len and data are unused.
+        */
+       return dm_queue_merge_is_compulsory(bdev_get_queue(dev->bdev));
+}
+
+/*
+ * Decide, from the underlying devices of every target, whether
+ * merge_bvec_fn may be ignored for this table: 1 if so, 0 if not.
+ */
+static int dm_table_merge_is_optional(struct dm_table *table)
+{
+       struct dm_target *tgt;
+       unsigned idx;
+
+       for (idx = 0; idx < dm_table_get_num_targets(table); idx++) {
+               tgt = dm_table_get_target(table, idx);
+               if (!tgt->type->iterate_devices)
+                       continue;
+               if (tgt->type->iterate_devices(tgt, dm_device_merge_is_compulsory, NULL))
+                       return 0;       /* one compulsory device decides it */
+       }
+
+       return 1;
+}
+
 /*
  * Returns old map, which caller must destroy.
  */
@@ -1995,6 +2055,7 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
        struct request_queue *q = md->queue;
        sector_t size;
        unsigned long flags;
+       int merge_is_optional;
 
        size = dm_table_get_size(t);
 
@@ -2020,10 +2081,16 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
 
        __bind_mempools(md, t);
 
+       merge_is_optional = dm_table_merge_is_optional(t);
+
        write_lock_irqsave(&md->map_lock, flags);
        old_map = md->map;
        md->map = t;
        dm_table_set_restrictions(t, q, limits);
+       if (merge_is_optional)
+               set_bit(DMF_MERGE_IS_OPTIONAL, &md->flags);
+       else
+               clear_bit(DMF_MERGE_IS_OPTIONAL, &md->flags);
        write_unlock_irqrestore(&md->map_lock, flags);
 
        return old_map;
index 1aaf16746da86f6bfb9c9ffaab334841c044d92a..6745dbd278a4ffc463188c72cab4f301a74adc74 100644 (file)
@@ -66,6 +66,8 @@ int dm_table_alloc_md_mempools(struct dm_table *t);
 void dm_table_free_md_mempools(struct dm_table *t);
 struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t);
 
+int dm_queue_merge_is_compulsory(struct request_queue *q);
+
 void dm_lock_md_type(struct mapped_device *md);
 void dm_unlock_md_type(struct mapped_device *md);
 void dm_set_md_type(struct mapped_device *md, unsigned type);
index 4427e04540516ef006e08ce8de95bebc23516905..3fa1f3d90ce0e21cb3ba27fa98d3201252f91f8e 100644 (file)
@@ -208,6 +208,49 @@ struct dm_target_callbacks {
 int dm_register_target(struct target_type *t);
 void dm_unregister_target(struct target_type *t);
 
+/*
+ * Target argument parsing.
+ */
+struct dm_arg_set {
+       unsigned argc;          /* number of arguments not yet consumed */
+       char **argv;            /* points at the next unconsumed argument */
+};
+
+/*
+ * The minimum and maximum value of a numeric argument, together with
+ * the error message to use if the number is found to be outside that range.
+ */
+struct dm_arg {
+       unsigned min;           /* smallest accepted value (inclusive) */
+       unsigned max;           /* largest accepted value (inclusive) */
+       char *error;            /* handed back through *error on rejection */
+};
+
+/*
+ * Validate the next argument, either returning it as *value or, if invalid,
+ * returning -EINVAL and setting *error.
+ */
+int dm_read_arg(struct dm_arg *arg, struct dm_arg_set *arg_set,
+               unsigned *value, char **error);
+
+/*
+ * Process the next argument as the start of a group containing between
+ * arg->min and arg->max further arguments. Either return the size as
+ * *num_args or, if invalid, return -EINVAL and set *error.
+ */
+int dm_read_arg_group(struct dm_arg *arg, struct dm_arg_set *arg_set,
+                     unsigned *num_args, char **error);
+
+/*
+ * Return the current argument and shift to the next.
+ */
+const char *dm_shift_arg(struct dm_arg_set *as);
+
+/*
+ * Move through num_args arguments.
+ */
+void dm_consume_args(struct dm_arg_set *as, unsigned num_args);
+
 /*-----------------------------------------------------------------
  * Functions for creating and manipulating mapped devices.
  * Drop the reference with dm_put when you finish with the object.
index 3708455ee6c38f49cb8d7fd97b040f100a3379aa..0cb8eff76bd6e563999df6c081251c4f8a8ef292 100644 (file)
@@ -267,9 +267,9 @@ enum {
 #define DM_DEV_SET_GEOMETRY    _IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl)
 
 #define DM_VERSION_MAJOR       4
-#define DM_VERSION_MINOR       20
+#define DM_VERSION_MINOR       21
 #define DM_VERSION_PATCHLEVEL  0
-#define DM_VERSION_EXTRA       "-ioctl (2011-02-02)"
+#define DM_VERSION_EXTRA       "-ioctl (2011-07-06)"
 
 /* Status bits */
 #define DM_READONLY_FLAG       (1 << 0) /* In/Out */
index 298d587e349b17169ad1a295645f66e307f42ff4..5e54458e920f36466a29d83ccbe913a03f347a01 100644 (file)
@@ -42,5 +42,20 @@ int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from,
                   unsigned num_dests, struct dm_io_region *dests,
                   unsigned flags, dm_kcopyd_notify_fn fn, void *context);
 
+/*
+ * Prepare a callback and submit it via the kcopyd thread.
+ *
+ * dm_kcopyd_prepare_callback allocates a callback structure and returns it.
+ * It must not be called from interrupt context.
+ * The returned value should be passed into dm_kcopyd_do_callback.
+ *
+ * dm_kcopyd_do_callback submits the callback.
+ * It may be called from interrupt context.
+ * The callback is issued from the kcopyd thread.
+ */
+void *dm_kcopyd_prepare_callback(struct dm_kcopyd_client *kc,
+                                dm_kcopyd_notify_fn fn, void *context);
+void dm_kcopyd_do_callback(void *job, int read_err, unsigned long write_err);
+
 #endif /* __KERNEL__ */
 #endif /* _LINUX_DM_KCOPYD_H */