Merge tag 'virtio-next-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git...
author Linus Torvalds <torvalds@linux-foundation.org>
Thu, 20 Dec 2012 16:37:04 +0000 (08:37 -0800)
committer Linus Torvalds <torvalds@linux-foundation.org>
Thu, 20 Dec 2012 16:37:05 +0000 (08:37 -0800)
Pull virtio update from Rusty Russell:
 "Some nice cleanups, and even a patch my wife did as a "live" demo for
  Latinoware 2012.

  There's a slightly non-trivial merge in virtio-net, as we cleaned up
  the virtio add_buf interface while DaveM accepted the mq virtio-net
  patches."

* tag 'virtio-next-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rusty/linux: (27 commits)
  virtio_console: Add support for remoteproc serial
  virtio_console: Merge struct buffer_token into struct port_buffer
  virtio: add drv_to_virtio to make code clearly
  virtio: use dev_to_virtio wrapper in virtio
  virtio-mmio: Fix irq parsing in command line parameter
  virtio_console: Free buffers from out-queue upon close
  virtio: Convert dev_printk(KERN_<LEVEL> to dev_<level>(
  virtio_console: Use kmalloc instead of kzalloc
  virtio_console: Free buffer if splice fails
  virtio: tools: make it clear that virtqueue_add_buf() no longer returns > 0
  virtio: scsi: make it clear that virtqueue_add_buf() no longer returns > 0
  virtio: rpmsg: make it clear that virtqueue_add_buf() no longer returns > 0
  virtio: net: make it clear that virtqueue_add_buf() no longer returns > 0
  virtio: console: make it clear that virtqueue_add_buf() no longer returns > 0
  virtio: make virtqueue_add_buf() returning 0 on success, not capacity.
  virtio: console: don't rely on virtqueue_add_buf() returning capacity.
  virtio_net: don't rely on virtqueue_add_buf() returning capacity.
  virtio-net: remove unused skb_vnet_hdr->num_sg field
  virtio-net: correct capacity math on ring full
  virtio: move queue_index and num_free fields into core struct virtqueue.
  ...

1  2 
drivers/char/virtio_console.c
drivers/net/virtio_net.c
drivers/scsi/virtio_scsi.c
drivers/virtio/virtio.c
drivers/virtio/virtio_balloon.c
mm/highmem.c
tools/virtio/virtio_test.c

  #include <linux/wait.h>
  #include <linux/workqueue.h>
  #include <linux/module.h>
+ #include <linux/dma-mapping.h>
+ #include <linux/kconfig.h>
  #include "../tty/hvc/hvc_console.h"
  
+ #define is_rproc_enabled IS_ENABLED(CONFIG_REMOTEPROC)
  /*
   * This is a global struct for storing common data for all the devices
   * this driver handles.
@@@ -111,6 -115,21 +115,21 @@@ struct port_buffer 
        size_t len;
        /* offset in the buf from which to consume data */
        size_t offset;
+       /* DMA address of buffer */
+       dma_addr_t dma;
+       /* Device we got DMA memory from */
+       struct device *dev;
+       /* List of pending dma buffers to free */
+       struct list_head list;
+       /* If sgpages == 0 then buf is used */
+       unsigned int sgpages;
+       /* sg is used if sgpages > 0. sg must be the last in this struct */
+       struct scatterlist sg[0];
  };
  
  /*
@@@ -325,6 -344,11 +344,11 @@@ static bool is_console_port(struct por
        return false;
  }
  
+ static bool is_rproc_serial(const struct virtio_device *vdev)
+ {
+       return is_rproc_enabled && vdev->id.device == VIRTIO_ID_RPROC_SERIAL;
+ }
  static inline bool use_multiport(struct ports_device *portdev)
  {
        /*
        return portdev->vdev->features[0] & (1 << VIRTIO_CONSOLE_F_MULTIPORT);
  }
  
- static void free_buf(struct port_buffer *buf)
+ static DEFINE_SPINLOCK(dma_bufs_lock);
+ static LIST_HEAD(pending_free_dma_bufs);
+ static void free_buf(struct port_buffer *buf, bool can_sleep)
  {
-       kfree(buf->buf);
+       unsigned int i;
+       for (i = 0; i < buf->sgpages; i++) {
+               struct page *page = sg_page(&buf->sg[i]);
+               if (!page)
+                       break;
+               put_page(page);
+       }
+       if (!buf->dev) {
+               kfree(buf->buf);
+       } else if (is_rproc_enabled) {
+               unsigned long flags;
+               /* dma_free_coherent requires interrupts to be enabled. */
+               if (!can_sleep) {
+                       /* queue up dma-buffers to be freed later */
+                       spin_lock_irqsave(&dma_bufs_lock, flags);
+                       list_add_tail(&buf->list, &pending_free_dma_bufs);
+                       spin_unlock_irqrestore(&dma_bufs_lock, flags);
+                       return;
+               }
+               dma_free_coherent(buf->dev, buf->size, buf->buf, buf->dma);
+               /* Release device refcnt and allow it to be freed */
+               put_device(buf->dev);
+       }
        kfree(buf);
  }
  
- static struct port_buffer *alloc_buf(size_t buf_size)
+ static void reclaim_dma_bufs(void)
+ {
+       unsigned long flags;
+       struct port_buffer *buf, *tmp;
+       LIST_HEAD(tmp_list);
+       if (list_empty(&pending_free_dma_bufs))
+               return;
+       /* Create a copy of the pending_free_dma_bufs while holding the lock */
+       spin_lock_irqsave(&dma_bufs_lock, flags);
+       list_cut_position(&tmp_list, &pending_free_dma_bufs,
+                         pending_free_dma_bufs.prev);
+       spin_unlock_irqrestore(&dma_bufs_lock, flags);
+       /* Release the dma buffers outside the lock, i.e. without irqs disabled */
+       list_for_each_entry_safe(buf, tmp, &tmp_list, list) {
+               list_del(&buf->list);
+               free_buf(buf, true);
+       }
+ }
+ static struct port_buffer *alloc_buf(struct virtqueue *vq, size_t buf_size,
+                                    int pages)
  {
        struct port_buffer *buf;
  
-       buf = kmalloc(sizeof(*buf), GFP_KERNEL);
+       reclaim_dma_bufs();
+       /*
+        * Allocate buffer and the sg list. The sg list array is allocated
+        * directly after the port_buffer struct.
+        */
+       buf = kmalloc(sizeof(*buf) + sizeof(struct scatterlist) * pages,
+                     GFP_KERNEL);
        if (!buf)
                goto fail;
-       buf->buf = kzalloc(buf_size, GFP_KERNEL);
+       buf->sgpages = pages;
+       if (pages > 0) {
+               buf->dev = NULL;
+               buf->buf = NULL;
+               return buf;
+       }
+       if (is_rproc_serial(vq->vdev)) {
+               /*
+                * Allocate DMA memory from ancestor. When a virtio
+                * device is created by remoteproc, the DMA memory is
+                * associated with the grandparent device:
+                * vdev => rproc => platform-dev.
+                * The code here would have been less quirky if
+                * DMA_MEMORY_INCLUDES_CHILDREN had been supported
+                * in dma-coherent.c
+                */
+               if (!vq->vdev->dev.parent || !vq->vdev->dev.parent->parent)
+                       goto free_buf;
+               buf->dev = vq->vdev->dev.parent->parent;
+               /* Increase device refcnt to avoid freeing it */
+               get_device(buf->dev);
+               buf->buf = dma_alloc_coherent(buf->dev, buf_size, &buf->dma,
+                                             GFP_KERNEL);
+       } else {
+               buf->dev = NULL;
+               buf->buf = kmalloc(buf_size, GFP_KERNEL);
+       }
        if (!buf->buf)
                goto free_buf;
        buf->len = 0;
@@@ -396,6 -510,8 +510,8 @@@ static int add_inbuf(struct virtqueue *
  
        ret = virtqueue_add_buf(vq, sg, 0, 1, buf, GFP_ATOMIC);
        virtqueue_kick(vq);
+       if (!ret)
+               ret = vq->num_free;
        return ret;
  }
  
@@@ -416,7 -532,7 +532,7 @@@ static void discard_port_data(struct po
                port->stats.bytes_discarded += buf->len - buf->offset;
                if (add_inbuf(port->in_vq, buf) < 0) {
                        err++;
-                       free_buf(buf);
+                       free_buf(buf, false);
                }
                port->inbuf = NULL;
                buf = get_inbuf(port);
@@@ -459,7 -575,7 +575,7 @@@ static ssize_t __send_control_msg(struc
        vq = portdev->c_ovq;
  
        sg_init_one(sg, &cpkt, sizeof(cpkt));
-       if (virtqueue_add_buf(vq, sg, 1, 0, &cpkt, GFP_ATOMIC) >= 0) {
+       if (virtqueue_add_buf(vq, sg, 1, 0, &cpkt, GFP_ATOMIC) == 0) {
                virtqueue_kick(vq);
                while (!virtqueue_get_buf(vq, &len))
                        cpu_relax();
@@@ -476,55 -592,29 +592,29 @@@ static ssize_t send_control_msg(struct 
        return 0;
  }
  
- struct buffer_token {
-       union {
-               void *buf;
-               struct scatterlist *sg;
-       } u;
-       /* If sgpages == 0 then buf is used, else sg is used */
-       unsigned int sgpages;
- };
- static void reclaim_sg_pages(struct scatterlist *sg, unsigned int nrpages)
- {
-       int i;
-       struct page *page;
-       for (i = 0; i < nrpages; i++) {
-               page = sg_page(&sg[i]);
-               if (!page)
-                       break;
-               put_page(page);
-       }
-       kfree(sg);
- }
  
  /* Callers must take the port->outvq_lock */
  static void reclaim_consumed_buffers(struct port *port)
  {
-       struct buffer_token *tok;
+       struct port_buffer *buf;
        unsigned int len;
  
        if (!port->portdev) {
                /* Device has been unplugged.  vqs are already gone. */
                return;
        }
-       while ((tok = virtqueue_get_buf(port->out_vq, &len))) {
-               if (tok->sgpages)
-                       reclaim_sg_pages(tok->u.sg, tok->sgpages);
-               else
-                       kfree(tok->u.buf);
-               kfree(tok);
+       while ((buf = virtqueue_get_buf(port->out_vq, &len))) {
+               free_buf(buf, false);
                port->outvq_full = false;
        }
  }
  
  static ssize_t __send_to_port(struct port *port, struct scatterlist *sg,
                              int nents, size_t in_count,
-                             struct buffer_token *tok, bool nonblock)
+                             void *data, bool nonblock)
  {
        struct virtqueue *out_vq;
-       ssize_t ret;
+       int err;
        unsigned long flags;
        unsigned int len;
  
  
        reclaim_consumed_buffers(port);
  
-       ret = virtqueue_add_buf(out_vq, sg, nents, 0, tok, GFP_ATOMIC);
+       err = virtqueue_add_buf(out_vq, sg, nents, 0, data, GFP_ATOMIC);
  
        /* Tell Host to go! */
        virtqueue_kick(out_vq);
  
-       if (ret < 0) {
+       if (err) {
                in_count = 0;
                goto done;
        }
  
-       if (ret == 0)
+       if (out_vq->num_free == 0)
                port->outvq_full = true;
  
        if (nonblock)
@@@ -572,37 -662,6 +662,6 @@@ done
        return in_count;
  }
  
- static ssize_t send_buf(struct port *port, void *in_buf, size_t in_count,
-                       bool nonblock)
- {
-       struct scatterlist sg[1];
-       struct buffer_token *tok;
-       tok = kmalloc(sizeof(*tok), GFP_ATOMIC);
-       if (!tok)
-               return -ENOMEM;
-       tok->sgpages = 0;
-       tok->u.buf = in_buf;
-       sg_init_one(sg, in_buf, in_count);
-       return __send_to_port(port, sg, 1, in_count, tok, nonblock);
- }
- static ssize_t send_pages(struct port *port, struct scatterlist *sg, int nents,
-                         size_t in_count, bool nonblock)
- {
-       struct buffer_token *tok;
-       tok = kmalloc(sizeof(*tok), GFP_ATOMIC);
-       if (!tok)
-               return -ENOMEM;
-       tok->sgpages = nents;
-       tok->u.sg = sg;
-       return __send_to_port(port, sg, nents, in_count, tok, nonblock);
- }
  /*
   * Give out the data that's requested from the buffer that we have
   * queued up.
@@@ -748,9 -807,10 +807,10 @@@ static ssize_t port_fops_write(struct f
                               size_t count, loff_t *offp)
  {
        struct port *port;
-       char *buf;
+       struct port_buffer *buf;
        ssize_t ret;
        bool nonblock;
+       struct scatterlist sg[1];
  
        /* Userspace could be out to fool us */
        if (!count)
  
        count = min((size_t)(32 * 1024), count);
  
-       buf = kmalloc(count, GFP_KERNEL);
+       buf = alloc_buf(port->out_vq, count, 0);
        if (!buf)
                return -ENOMEM;
  
-       ret = copy_from_user(buf, ubuf, count);
+       ret = copy_from_user(buf->buf, ubuf, count);
        if (ret) {
                ret = -EFAULT;
                goto free_buf;
         * through to the host.
         */
        nonblock = true;
-       ret = send_buf(port, buf, count, nonblock);
+       sg_init_one(sg, buf->buf, count);
+       ret = __send_to_port(port, sg, 1, count, buf, nonblock);
  
        if (nonblock && ret > 0)
                goto out;
  
  free_buf:
-       kfree(buf);
+       free_buf(buf, true);
  out:
        return ret;
  }
@@@ -856,6 -917,7 +917,7 @@@ static ssize_t port_fops_splice_write(s
        struct port *port = filp->private_data;
        struct sg_list sgl;
        ssize_t ret;
+       struct port_buffer *buf;
        struct splice_desc sd = {
                .total_len = len,
                .flags = flags,
                .u.data = &sgl,
        };
  
+       /*
+        * Rproc_serial does not yet support splice. To support splice
+        * pipe_to_sg() must allocate dma-buffers and copy content from
+        * regular pages to dma pages. And alloc_buf and free_buf must
+        * support allocating and freeing such a list of dma-buffers.
+        */
+       if (is_rproc_serial(port->out_vq->vdev))
+               return -EINVAL;
        ret = wait_port_writable(port, filp->f_flags & O_NONBLOCK);
        if (ret < 0)
                return ret;
  
+       buf = alloc_buf(port->out_vq, 0, pipe->nrbufs);
+       if (!buf)
+               return -ENOMEM;
        sgl.n = 0;
        sgl.len = 0;
        sgl.size = pipe->nrbufs;
-       sgl.sg = kmalloc(sizeof(struct scatterlist) * sgl.size, GFP_KERNEL);
-       if (unlikely(!sgl.sg))
-               return -ENOMEM;
+       sgl.sg = buf->sg;
        sg_init_table(sgl.sg, sgl.size);
        ret = __splice_from_pipe(pipe, &sd, pipe_to_sg);
        if (likely(ret > 0))
-               ret = send_pages(port, sgl.sg, sgl.n, sgl.len, true);
+               ret = __send_to_port(port, buf->sg, sgl.n, sgl.len, buf, true);
  
+       if (unlikely(ret <= 0))
+               free_buf(buf, true);
        return ret;
  }
  
@@@ -927,6 -1001,7 +1001,7 @@@ static int port_fops_release(struct ino
        reclaim_consumed_buffers(port);
        spin_unlock_irq(&port->outvq_lock);
  
+       reclaim_dma_bufs();
        /*
         * Locks aren't necessary here as a port can't be opened after
         * unplug, and if a port isn't unplugged, a kref would already
@@@ -1031,6 -1106,7 +1106,7 @@@ static const struct file_operations por
  static int put_chars(u32 vtermno, const char *buf, int count)
  {
        struct port *port;
+       struct scatterlist sg[1];
  
        if (unlikely(early_put_chars))
                return early_put_chars(vtermno, buf, count);
        if (!port)
                return -EPIPE;
  
-       return send_buf(port, (void *)buf, count, false);
+       sg_init_one(sg, buf, count);
+       return __send_to_port(port, sg, 1, count, (void *)buf, false);
  }
  
  /*
@@@ -1076,7 -1153,10 +1153,10 @@@ static void resize_console(struct port 
                return;
  
        vdev = port->portdev->vdev;
-       if (virtio_has_feature(vdev, VIRTIO_CONSOLE_F_SIZE))
+       /* Don't test F_SIZE at all if we're rproc: not a valid feature! */
+       if (!is_rproc_serial(vdev) &&
+           virtio_has_feature(vdev, VIRTIO_CONSOLE_F_SIZE))
                hvc_resize(port->cons.hvc, port->cons.ws);
  }
  
@@@ -1260,7 -1340,7 +1340,7 @@@ static unsigned int fill_queue(struct v
  
        nr_added_bufs = 0;
        do {
-               buf = alloc_buf(PAGE_SIZE);
+               buf = alloc_buf(vq, PAGE_SIZE, 0);
                if (!buf)
                        break;
  
                ret = add_inbuf(vq, buf);
                if (ret < 0) {
                        spin_unlock_irq(lock);
-                       free_buf(buf);
+                       free_buf(buf, true);
                        break;
                }
                nr_added_bufs++;
@@@ -1356,10 -1436,18 +1436,18 @@@ static int add_port(struct ports_devic
                goto free_device;
        }
  
-       /*
-        * If we're not using multiport support, this has to be a console port
-        */
-       if (!use_multiport(port->portdev)) {
+       if (is_rproc_serial(port->portdev->vdev))
+               /*
+                * For rproc_serial assume remote processor is connected.
+                * rproc_serial does not want the console port, only
+                * the generic port implementation.
+                */
+               port->host_connected = true;
+       else if (!use_multiport(port->portdev)) {
+               /*
+                * If we're not using multiport support,
+                * this has to be a console port.
+                */
                err = init_port_console(port);
                if (err)
                        goto free_inbufs;
  
  free_inbufs:
        while ((buf = virtqueue_detach_unused_buf(port->in_vq)))
-               free_buf(buf);
+               free_buf(buf, true);
  free_device:
        device_destroy(pdrvdata.class, port->dev->devt);
  free_cdev:
@@@ -1434,7 -1522,11 +1522,11 @@@ static void remove_port_data(struct por
  
        /* Remove buffers we queued up for the Host to send us data in. */
        while ((buf = virtqueue_detach_unused_buf(port->in_vq)))
-               free_buf(buf);
+               free_buf(buf, true);
+       /* Free pending buffers from the out-queue. */
+       while ((buf = virtqueue_detach_unused_buf(port->out_vq)))
+               free_buf(buf, true);
  }
  
  /*
@@@ -1636,7 -1728,7 +1728,7 @@@ static void control_work_handler(struc
                if (add_inbuf(portdev->c_ivq, buf) < 0) {
                        dev_warn(&portdev->vdev->dev,
                                 "Error adding buffer to queue\n");
-                       free_buf(buf);
+                       free_buf(buf, false);
                }
        }
        spin_unlock(&portdev->cvq_lock);
@@@ -1832,10 -1924,10 +1924,10 @@@ static void remove_controlq_data(struc
                return;
  
        while ((buf = virtqueue_get_buf(portdev->c_ivq, &len)))
-               free_buf(buf);
+               free_buf(buf, true);
  
        while ((buf = virtqueue_detach_unused_buf(portdev->c_ivq)))
-               free_buf(buf);
+               free_buf(buf, true);
  }
  
  /*
   * config space to see how many ports the host has spawned.  We
   * initialize each port found.
   */
 -static int __devinit virtcons_probe(struct virtio_device *vdev)
 +static int virtcons_probe(struct virtio_device *vdev)
  {
        struct ports_device *portdev;
        int err;
  
        multiport = false;
        portdev->config.max_nr_ports = 1;
-       if (virtio_config_val(vdev, VIRTIO_CONSOLE_F_MULTIPORT,
-                             offsetof(struct virtio_console_config,
-                                      max_nr_ports),
-                             &portdev->config.max_nr_ports) == 0)
+       /* Don't test MULTIPORT at all if we're rproc: not a valid feature! */
+       if (!is_rproc_serial(vdev) &&
+           virtio_config_val(vdev, VIRTIO_CONSOLE_F_MULTIPORT,
+                                 offsetof(struct virtio_console_config,
+                                          max_nr_ports),
+                                 &portdev->config.max_nr_ports) == 0) {
                multiport = true;
+       }
  
        err = init_vqs(portdev);
        if (err < 0) {
@@@ -1996,6 -2092,16 +2092,16 @@@ static unsigned int features[] = 
        VIRTIO_CONSOLE_F_MULTIPORT,
  };
  
+ static struct virtio_device_id rproc_serial_id_table[] = {
+ #if IS_ENABLED(CONFIG_REMOTEPROC)
+       { VIRTIO_ID_RPROC_SERIAL, VIRTIO_DEV_ANY_ID },
+ #endif
+       { 0 },
+ };
+ static unsigned int rproc_serial_features[] = {
+ };
  #ifdef CONFIG_PM
  static int virtcons_freeze(struct virtio_device *vdev)
  {
@@@ -2080,6 -2186,20 +2186,20 @@@ static struct virtio_driver virtio_cons
  #endif
  };
  
+ /*
+  * virtio_rproc_serial refers to __devinit function which causes
+  * section mismatch warnings. So use __refdata to silence warnings.
+  */
+ static struct virtio_driver __refdata virtio_rproc_serial = {
+       .feature_table = rproc_serial_features,
+       .feature_table_size = ARRAY_SIZE(rproc_serial_features),
+       .driver.name =  "virtio_rproc_serial",
+       .driver.owner = THIS_MODULE,
+       .id_table =     rproc_serial_id_table,
+       .probe =        virtcons_probe,
+       .remove =       virtcons_remove,
+ };
  static int __init init(void)
  {
        int err;
                pr_err("Error %d registering virtio driver\n", err);
                goto free;
        }
+       err = register_virtio_driver(&virtio_rproc_serial);
+       if (err < 0) {
+               pr_err("Error %d registering virtio rproc serial driver\n",
+                      err);
+               goto unregister;
+       }
        return 0;
+ unregister:
+       unregister_virtio_driver(&virtio_console);
  free:
        if (pdrvdata.debugfs_dir)
                debugfs_remove_recursive(pdrvdata.debugfs_dir);
  
  static void __exit fini(void)
  {
+       reclaim_dma_bufs();
        unregister_virtio_driver(&virtio_console);
+       unregister_virtio_driver(&virtio_rproc_serial);
  
        class_destroy(pdrvdata.class);
        if (pdrvdata.debugfs_dir)
diff --combined drivers/net/virtio_net.c
@@@ -51,51 -51,15 +51,51 @@@ struct virtnet_stats 
        u64 rx_packets;
  };
  
 +/* Internal representation of a send virtqueue */
 +struct send_queue {
 +      /* Virtqueue associated with this send _queue */
 +      struct virtqueue *vq;
 +
 +      /* TX: fragments + linear part + virtio header */
 +      struct scatterlist sg[MAX_SKB_FRAGS + 2];
 +
 +      /* Name of the send queue: output.$index */
 +      char name[40];
 +};
 +
 +/* Internal representation of a receive virtqueue */
 +struct receive_queue {
 +      /* Virtqueue associated with this receive_queue */
 +      struct virtqueue *vq;
 +
 +      struct napi_struct napi;
 +
 +      /* Number of input buffers, and max we've ever had. */
 +      unsigned int num, max;
 +
 +      /* Chain pages by the private ptr. */
 +      struct page *pages;
 +
 +      /* RX: fragments + linear part + virtio header */
 +      struct scatterlist sg[MAX_SKB_FRAGS + 2];
 +
 +      /* Name of this receive queue: input.$index */
 +      char name[40];
 +};
 +
  struct virtnet_info {
        struct virtio_device *vdev;
 -      struct virtqueue *rvq, *svq, *cvq;
 +      struct virtqueue *cvq;
        struct net_device *dev;
 -      struct napi_struct napi;
 +      struct send_queue *sq;
 +      struct receive_queue *rq;
        unsigned int status;
  
 -      /* Number of input buffers, and max we've ever had. */
 -      unsigned int num, max;
 +      /* Max # of queue pairs supported by the device */
 +      u16 max_queue_pairs;
 +
 +      /* # of queue pairs currently used by the driver */
 +      u16 curr_queue_pairs;
  
        /* I like... big packets and I cannot lie! */
        bool big_packets;
        /* Host will merge rx buffers for big packets (shake it! shake it!) */
        bool mergeable_rx_bufs;
  
 +      /* Has control virtqueue */
 +      bool has_cvq;
 +
        /* enable config space updates */
        bool config_enable;
  
        /* Lock for config space updates */
        struct mutex config_lock;
  
 -      /* Chain pages by the private ptr. */
 -      struct page *pages;
 -
 -      /* fragments + linear part + virtio header */
 -      struct scatterlist rx_sg[MAX_SKB_FRAGS + 2];
 -      struct scatterlist tx_sg[MAX_SKB_FRAGS + 2];
 +      /* Is the affinity hint set for virtqueues? */
 +      bool affinity_hint_set;
  };
  
  struct skb_vnet_hdr {
                struct virtio_net_hdr hdr;
                struct virtio_net_hdr_mrg_rxbuf mhdr;
        };
-       unsigned int num_sg;
  };
  
  struct padded_vnet_hdr {
        char padding[6];
  };
  
 +/* Converting between virtqueue no. and kernel tx/rx queue no.
 + * 0:rx0 1:tx0 2:rx1 3:tx1 ... 2N:rxN 2N+1:txN 2N+2:cvq
 + */
 +static int vq2txq(struct virtqueue *vq)
 +{
 +      return (virtqueue_get_queue_index(vq) - 1) / 2;
 +}
 +
 +static int txq2vq(int txq)
 +{
 +      return txq * 2 + 1;
 +}
 +
 +static int vq2rxq(struct virtqueue *vq)
 +{
 +      return virtqueue_get_queue_index(vq) / 2;
 +}
 +
 +static int rxq2vq(int rxq)
 +{
 +      return rxq * 2;
 +}
 +
  static inline struct skb_vnet_hdr *skb_vnet_hdr(struct sk_buff *skb)
  {
        return (struct skb_vnet_hdr *)skb->cb;
   * private is used to chain pages for big packets, put the whole
   * most recent used list in the beginning for reuse
   */
 -static void give_pages(struct virtnet_info *vi, struct page *page)
 +static void give_pages(struct receive_queue *rq, struct page *page)
  {
        struct page *end;
  
 +      /* Find end of list, sew whole thing into rq->pages. */
 +      /* Find end of list, sew whole thing into vi->rq.pages. */
        for (end = page; end->private; end = (struct page *)end->private);
 -      end->private = (unsigned long)vi->pages;
 -      vi->pages = page;
 +      end->private = (unsigned long)rq->pages;
 +      rq->pages = page;
  }
  
 -static struct page *get_a_page(struct virtnet_info *vi, gfp_t gfp_mask)
 +static struct page *get_a_page(struct receive_queue *rq, gfp_t gfp_mask)
  {
 -      struct page *p = vi->pages;
 +      struct page *p = rq->pages;
  
        if (p) {
 -              vi->pages = (struct page *)p->private;
 +              rq->pages = (struct page *)p->private;
                /* clear private here, it is used to chain pages */
                p->private = 0;
        } else
        return p;
  }
  
 -static void skb_xmit_done(struct virtqueue *svq)
 +static void skb_xmit_done(struct virtqueue *vq)
  {
 -      struct virtnet_info *vi = svq->vdev->priv;
 +      struct virtnet_info *vi = vq->vdev->priv;
  
        /* Suppress further interrupts. */
 -      virtqueue_disable_cb(svq);
 +      virtqueue_disable_cb(vq);
  
        /* We were probably waiting for more output buffers. */
 -      netif_wake_queue(vi->dev);
 +      netif_wake_subqueue(vi->dev, vq2txq(vq));
  }
  
  static void set_skb_frag(struct sk_buff *skb, struct page *page,
  }
  
  /* Called from bottom half context */
 -static struct sk_buff *page_to_skb(struct virtnet_info *vi,
 +static struct sk_buff *page_to_skb(struct receive_queue *rq,
                                   struct page *page, unsigned int len)
  {
 +      struct virtnet_info *vi = rq->vq->vdev->priv;
        struct sk_buff *skb;
        struct skb_vnet_hdr *hdr;
        unsigned int copy, hdr_len, offset;
         * the case of a broken device.
         */
        if (unlikely(len > MAX_SKB_FRAGS * PAGE_SIZE)) {
 -              if (net_ratelimit())
 -                      pr_debug("%s: too much data\n", skb->dev->name);
 +              net_dbg_ratelimited("%s: too much data\n", skb->dev->name);
                dev_kfree_skb(skb);
                return NULL;
        }
        }
  
        if (page)
 -              give_pages(vi, page);
 +              give_pages(rq, page);
  
        return skb;
  }
  
 -static int receive_mergeable(struct virtnet_info *vi, struct sk_buff *skb)
 +static int receive_mergeable(struct receive_queue *rq, struct sk_buff *skb)
  {
        struct skb_vnet_hdr *hdr = skb_vnet_hdr(skb);
        struct page *page;
                        skb->dev->stats.rx_length_errors++;
                        return -EINVAL;
                }
 -              page = virtqueue_get_buf(vi->rvq, &len);
 +              page = virtqueue_get_buf(rq->vq, &len);
                if (!page) {
                        pr_debug("%s: rx error: %d buffers missing\n",
                                 skb->dev->name, hdr->mhdr.num_buffers);
  
                set_skb_frag(skb, page, 0, &len);
  
 -              --vi->num;
 +              --rq->num;
        }
        return 0;
  }
  
 -static void receive_buf(struct net_device *dev, void *buf, unsigned int len)
 +static void receive_buf(struct receive_queue *rq, void *buf, unsigned int len)
  {
 -      struct virtnet_info *vi = netdev_priv(dev);
 +      struct virtnet_info *vi = rq->vq->vdev->priv;
 +      struct net_device *dev = vi->dev;
        struct virtnet_stats *stats = this_cpu_ptr(vi->stats);
        struct sk_buff *skb;
        struct page *page;
                pr_debug("%s: short packet %i\n", dev->name, len);
                dev->stats.rx_length_errors++;
                if (vi->mergeable_rx_bufs || vi->big_packets)
 -                      give_pages(vi, buf);
 +                      give_pages(rq, buf);
                else
                        dev_kfree_skb(buf);
                return;
                skb_trim(skb, len);
        } else {
                page = buf;
 -              skb = page_to_skb(vi, page, len);
 +              skb = page_to_skb(rq, page, len);
                if (unlikely(!skb)) {
                        dev->stats.rx_dropped++;
 -                      give_pages(vi, page);
 +                      give_pages(rq, page);
                        return;
                }
                if (vi->mergeable_rx_bufs)
 -                      if (receive_mergeable(vi, skb)) {
 +                      if (receive_mergeable(rq, skb)) {
                                dev_kfree_skb(skb);
                                return;
                        }
                        skb_shinfo(skb)->gso_type = SKB_GSO_TCPV6;
                        break;
                default:
 -                      if (net_ratelimit())
 -                              printk(KERN_WARNING "%s: bad gso type %u.\n",
 -                                     dev->name, hdr->hdr.gso_type);
 +                      net_warn_ratelimited("%s: bad gso type %u.\n",
 +                                           dev->name, hdr->hdr.gso_type);
                        goto frame_err;
                }
  
  
                skb_shinfo(skb)->gso_size = hdr->hdr.gso_size;
                if (skb_shinfo(skb)->gso_size == 0) {
 -                      if (net_ratelimit())
 -                              printk(KERN_WARNING "%s: zero gso size.\n",
 -                                     dev->name);
 +                      net_warn_ratelimited("%s: zero gso size.\n", dev->name);
                        goto frame_err;
                }
  
@@@ -419,9 -362,8 +418,9 @@@ frame_err
        dev_kfree_skb(skb);
  }
  
 -static int add_recvbuf_small(struct virtnet_info *vi, gfp_t gfp)
 +static int add_recvbuf_small(struct receive_queue *rq, gfp_t gfp)
  {
 +      struct virtnet_info *vi = rq->vq->vdev->priv;
        struct sk_buff *skb;
        struct skb_vnet_hdr *hdr;
        int err;
        skb_put(skb, MAX_PACKET_LEN);
  
        hdr = skb_vnet_hdr(skb);
 -      sg_set_buf(vi->rx_sg, &hdr->hdr, sizeof hdr->hdr);
 +      sg_set_buf(rq->sg, &hdr->hdr, sizeof hdr->hdr);
  
 -      skb_to_sgvec(skb, vi->rx_sg + 1, 0, skb->len);
 +      skb_to_sgvec(skb, rq->sg + 1, 0, skb->len);
  
 -      err = virtqueue_add_buf(vi->rvq, vi->rx_sg, 0, 2, skb, gfp);
 +      err = virtqueue_add_buf(rq->vq, rq->sg, 0, 2, skb, gfp);
        if (err < 0)
                dev_kfree_skb(skb);
  
        return err;
  }
  
 -static int add_recvbuf_big(struct virtnet_info *vi, gfp_t gfp)
 +static int add_recvbuf_big(struct receive_queue *rq, gfp_t gfp)
  {
        struct page *first, *list = NULL;
        char *p;
        int i, err, offset;
  
 -      /* page in vi->rx_sg[MAX_SKB_FRAGS + 1] is list tail */
 +      /* page in rq->sg[MAX_SKB_FRAGS + 1] is list tail */
        for (i = MAX_SKB_FRAGS + 1; i > 1; --i) {
 -              first = get_a_page(vi, gfp);
 +              first = get_a_page(rq, gfp);
                if (!first) {
                        if (list)
 -                              give_pages(vi, list);
 +                              give_pages(rq, list);
                        return -ENOMEM;
                }
 -              sg_set_buf(&vi->rx_sg[i], page_address(first), PAGE_SIZE);
 +              sg_set_buf(&rq->sg[i], page_address(first), PAGE_SIZE);
  
                /* chain new page in list head to match sg */
                first->private = (unsigned long)list;
                list = first;
        }
  
 -      first = get_a_page(vi, gfp);
 +      first = get_a_page(rq, gfp);
        if (!first) {
 -              give_pages(vi, list);
 +              give_pages(rq, list);
                return -ENOMEM;
        }
        p = page_address(first);
  
 -      /* vi->rx_sg[0], vi->rx_sg[1] share the same page */
 -      /* a separated vi->rx_sg[0] for virtio_net_hdr only due to QEMU bug */
 -      sg_set_buf(&vi->rx_sg[0], p, sizeof(struct virtio_net_hdr));
 +      /* rq->sg[0], rq->sg[1] share the same page */
 +      /* a separated rq->sg[0] for virtio_net_hdr only due to QEMU bug */
 +      sg_set_buf(&rq->sg[0], p, sizeof(struct virtio_net_hdr));
  
 -      /* vi->rx_sg[1] for data packet, from offset */
 +      /* rq->sg[1] for data packet, from offset */
        offset = sizeof(struct padded_vnet_hdr);
 -      sg_set_buf(&vi->rx_sg[1], p + offset, PAGE_SIZE - offset);
 +      sg_set_buf(&rq->sg[1], p + offset, PAGE_SIZE - offset);
  
        /* chain first in list head */
        first->private = (unsigned long)list;
 -      err = virtqueue_add_buf(vi->rvq, vi->rx_sg, 0, MAX_SKB_FRAGS + 2,
 +      err = virtqueue_add_buf(rq->vq, rq->sg, 0, MAX_SKB_FRAGS + 2,
                                first, gfp);
        if (err < 0)
 -              give_pages(vi, first);
 +              give_pages(rq, first);
  
        return err;
  }
  
 -static int add_recvbuf_mergeable(struct virtnet_info *vi, gfp_t gfp)
 +static int add_recvbuf_mergeable(struct receive_queue *rq, gfp_t gfp)
  {
        struct page *page;
        int err;
  
 -      page = get_a_page(vi, gfp);
 +      page = get_a_page(rq, gfp);
        if (!page)
                return -ENOMEM;
  
 -      sg_init_one(vi->rx_sg, page_address(page), PAGE_SIZE);
 +      sg_init_one(rq->sg, page_address(page), PAGE_SIZE);
  
 -      err = virtqueue_add_buf(vi->rvq, vi->rx_sg, 0, 1, page, gfp);
 +      err = virtqueue_add_buf(rq->vq, rq->sg, 0, 1, page, gfp);
        if (err < 0)
 -              give_pages(vi, page);
 +              give_pages(rq, page);
  
        return err;
  }
   * before we're receiving packets, or from refill_work which is
   * careful to disable receiving (using napi_disable).
   */
 -static bool try_fill_recv(struct virtnet_info *vi, gfp_t gfp)
 +static bool try_fill_recv(struct receive_queue *rq, gfp_t gfp)
  {
 +      struct virtnet_info *vi = rq->vq->vdev->priv;
        int err;
        bool oom;
  
        do {
                if (vi->mergeable_rx_bufs)
 -                      err = add_recvbuf_mergeable(vi, gfp);
 +                      err = add_recvbuf_mergeable(rq, gfp);
                else if (vi->big_packets)
 -                      err = add_recvbuf_big(vi, gfp);
 +                      err = add_recvbuf_big(rq, gfp);
                else
 -                      err = add_recvbuf_small(vi, gfp);
 +                      err = add_recvbuf_small(rq, gfp);
  
                oom = err == -ENOMEM;
-               if (err < 0)
+               if (err)
                        break;
 -              ++vi->num;
 -      } while (vi->rvq->num_free);
 -
 -      if (unlikely(vi->num > vi->max))
 -              vi->max = vi->num;
 -      virtqueue_kick(vi->rvq);
 +              ++rq->num;
-       } while (err > 0);
++      } while (rq->vq->num_free);
 +      if (unlikely(rq->num > rq->max))
 +              rq->max = rq->num;
 +      virtqueue_kick(rq->vq);
        return !oom;
  }
  
  static void skb_recv_done(struct virtqueue *rvq)
  {
        struct virtnet_info *vi = rvq->vdev->priv;
 +      struct receive_queue *rq = &vi->rq[vq2rxq(rvq)];
 +
        /* Schedule NAPI, Suppress further interrupts if successful. */
 -      if (napi_schedule_prep(&vi->napi)) {
 +      if (napi_schedule_prep(&rq->napi)) {
                virtqueue_disable_cb(rvq);
 -              __napi_schedule(&vi->napi);
 +              __napi_schedule(&rq->napi);
        }
  }
  
 -static void virtnet_napi_enable(struct virtnet_info *vi)
 +static void virtnet_napi_enable(struct receive_queue *rq)
  {
 -      napi_enable(&vi->napi);
 +      napi_enable(&rq->napi);
  
        /* If all buffers were filled by other side before we napi_enabled, we
         * won't get another interrupt, so process any outstanding packets
         * now.  virtnet_poll wants re-enable the queue, so we disable here.
         * We synchronize against interrupts via NAPI_STATE_SCHED */
 -      if (napi_schedule_prep(&vi->napi)) {
 -              virtqueue_disable_cb(vi->rvq);
 +      if (napi_schedule_prep(&rq->napi)) {
 +              virtqueue_disable_cb(rq->vq);
                local_bh_disable();
 -              __napi_schedule(&vi->napi);
 +              __napi_schedule(&rq->napi);
                local_bh_enable();
        }
  }
  
  static void refill_work(struct work_struct *work)
  {
 -      struct virtnet_info *vi;
 +      struct virtnet_info *vi =
 +              container_of(work, struct virtnet_info, refill.work);
        bool still_empty;
 +      int i;
  
 -      vi = container_of(work, struct virtnet_info, refill.work);
 -      napi_disable(&vi->napi);
 -      still_empty = !try_fill_recv(vi, GFP_KERNEL);
 -      virtnet_napi_enable(vi);
 +      for (i = 0; i < vi->max_queue_pairs; i++) {
 +              struct receive_queue *rq = &vi->rq[i];
  
 -      /* In theory, this can happen: if we don't get any buffers in
 -       * we will *never* try to fill again. */
 -      if (still_empty)
 -              schedule_delayed_work(&vi->refill, HZ/2);
 +              napi_disable(&rq->napi);
 +              still_empty = !try_fill_recv(rq, GFP_KERNEL);
 +              virtnet_napi_enable(rq);
 +
 +              /* In theory, this can happen: if we don't get any buffers in
 +               * we will *never* try to fill again.
 +               */
 +              if (still_empty)
 +                      schedule_delayed_work(&vi->refill, HZ/2);
 +      }
  }
  
  static int virtnet_poll(struct napi_struct *napi, int budget)
  {
 -      struct virtnet_info *vi = container_of(napi, struct virtnet_info, napi);
 +      struct receive_queue *rq =
 +              container_of(napi, struct receive_queue, napi);
 +      struct virtnet_info *vi = rq->vq->vdev->priv;
        void *buf;
        unsigned int len, received = 0;
  
  again:
        while (received < budget &&
 -             (buf = virtqueue_get_buf(vi->rvq, &len)) != NULL) {
 -              receive_buf(vi->dev, buf, len);
 -              --vi->num;
 +             (buf = virtqueue_get_buf(rq->vq, &len)) != NULL) {
 +              receive_buf(rq, buf, len);
 +              --rq->num;
                received++;
        }
  
 -      if (vi->num < vi->max / 2) {
 -              if (!try_fill_recv(vi, GFP_ATOMIC))
 +      if (rq->num < rq->max / 2) {
 +              if (!try_fill_recv(rq, GFP_ATOMIC))
                        schedule_delayed_work(&vi->refill, 0);
        }
  
        /* Out of packets? */
        if (received < budget) {
                napi_complete(napi);
 -              if (unlikely(!virtqueue_enable_cb(vi->rvq)) &&
 +              if (unlikely(!virtqueue_enable_cb(rq->vq)) &&
                    napi_schedule_prep(napi)) {
 -                      virtqueue_disable_cb(vi->rvq);
 +                      virtqueue_disable_cb(rq->vq);
                        __napi_schedule(napi);
                        goto again;
                }
        return received;
  }
  
 -static void free_old_xmit_skbs(struct virtnet_info *vi)
 +static int virtnet_open(struct net_device *dev)
 +{
 +      struct virtnet_info *vi = netdev_priv(dev);
 +      int i;
 +
 +      for (i = 0; i < vi->max_queue_pairs; i++) {
 +              /* Make sure we have some buffers: if oom use wq. */
 +              if (!try_fill_recv(&vi->rq[i], GFP_KERNEL))
 +                      schedule_delayed_work(&vi->refill, 0);
 +              virtnet_napi_enable(&vi->rq[i]);
 +      }
 +
 +      return 0;
 +}
 +
- static unsigned int free_old_xmit_skbs(struct send_queue *sq)
++static void free_old_xmit_skbs(struct send_queue *sq)
  {
        struct sk_buff *skb;
-       unsigned int len, tot_sgs = 0;
+       unsigned int len;
 +      struct virtnet_info *vi = sq->vq->vdev->priv;
        struct virtnet_stats *stats = this_cpu_ptr(vi->stats);
  
 -      while ((skb = virtqueue_get_buf(vi->svq, &len)) != NULL) {
 +      while ((skb = virtqueue_get_buf(sq->vq, &len)) != NULL) {
                pr_debug("Sent skb %p\n", skb);
  
                u64_stats_update_begin(&stats->tx_syncp);
                stats->tx_packets++;
                u64_stats_update_end(&stats->tx_syncp);
  
-               tot_sgs += skb_vnet_hdr(skb)->num_sg;
                dev_kfree_skb_any(skb);
        }
-       return tot_sgs;
  }
  
 -static int xmit_skb(struct virtnet_info *vi, struct sk_buff *skb)
 +static int xmit_skb(struct send_queue *sq, struct sk_buff *skb)
  {
        struct skb_vnet_hdr *hdr = skb_vnet_hdr(skb);
        const unsigned char *dest = ((struct ethhdr *)skb->data)->h_dest;
 +      struct virtnet_info *vi = sq->vq->vdev->priv;
+       unsigned num_sg;
  
        pr_debug("%s: xmit %p %pM\n", vi->dev->name, skb, dest);
  
  
        /* Encode metadata header at front. */
        if (vi->mergeable_rx_bufs)
 -              sg_set_buf(vi->tx_sg, &hdr->mhdr, sizeof hdr->mhdr);
 +              sg_set_buf(sq->sg, &hdr->mhdr, sizeof hdr->mhdr);
        else
 -              sg_set_buf(vi->tx_sg, &hdr->hdr, sizeof hdr->hdr);
 +              sg_set_buf(sq->sg, &hdr->hdr, sizeof hdr->hdr);
  
-       hdr->num_sg = skb_to_sgvec(skb, sq->sg + 1, 0, skb->len) + 1;
-       return virtqueue_add_buf(sq->vq, sq->sg, hdr->num_sg,
 -      num_sg = skb_to_sgvec(skb, vi->tx_sg + 1, 0, skb->len) + 1;
 -      return virtqueue_add_buf(vi->svq, vi->tx_sg, num_sg,
++      num_sg = skb_to_sgvec(skb, sq->sg + 1, 0, skb->len) + 1;
++      return virtqueue_add_buf(sq->vq, sq->sg, num_sg,
                                 0, skb, GFP_ATOMIC);
  }
  
  static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
  {
        struct virtnet_info *vi = netdev_priv(dev);
-       int capacity;
 +      int qnum = skb_get_queue_mapping(skb);
 +      struct send_queue *sq = &vi->sq[qnum];
+       int err;
  
        /* Free up any pending old buffers before queueing new ones. */
 -      free_old_xmit_skbs(vi);
 +      free_old_xmit_skbs(sq);
  
        /* Try to transmit */
-       capacity = xmit_skb(sq, skb);
-       /* This can happen with OOM and indirect buffers. */
-       if (unlikely(capacity < 0)) {
-               if (likely(capacity == -ENOMEM)) {
-                       if (net_ratelimit())
-                               dev_warn(&dev->dev,
-                                        "TXQ (%d) failure: out of memory\n",
-                                        qnum);
-               } else {
-                       dev->stats.tx_fifo_errors++;
-                       if (net_ratelimit())
-                               dev_warn(&dev->dev,
-                                        "Unexpected TXQ (%d) failure: %d\n",
-                                        qnum, capacity);
-               }
 -      err = xmit_skb(vi, skb);
++      err = xmit_skb(sq, skb);
+       /* This should not happen! */
+       if (unlikely(err)) {
+               dev->stats.tx_fifo_errors++;
+               if (net_ratelimit())
+                       dev_warn(&dev->dev,
 -                               "Unexpected TX queue failure: %d\n", err);
++                               "Unexpected TXQ (%d) queue failure: %d\n", qnum, err);
                dev->stats.tx_dropped++;
                kfree_skb(skb);
                return NETDEV_TX_OK;
        }
 -      virtqueue_kick(vi->svq);
 +      virtqueue_kick(sq->vq);
  
        /* Don't wait up for transmitted skbs to be freed. */
        skb_orphan(skb);
  
        /* Apparently nice girls don't return TX_BUSY; stop the queue
         * before it gets out of hand.  Naturally, this wastes entries. */
-       if (capacity < 2+MAX_SKB_FRAGS) {
 -      if (vi->svq->num_free < 2+MAX_SKB_FRAGS) {
 -              netif_stop_queue(dev);
 -              if (unlikely(!virtqueue_enable_cb_delayed(vi->svq))) {
++      if (sq->vq->num_free < 2+MAX_SKB_FRAGS) {
 +              netif_stop_subqueue(dev, qnum);
 +              if (unlikely(!virtqueue_enable_cb_delayed(sq->vq))) {
                        /* More just got used, free them then recheck. */
-                       capacity += free_old_xmit_skbs(sq);
-                       if (capacity >= 2+MAX_SKB_FRAGS) {
 -                      free_old_xmit_skbs(vi);
 -                      if (vi->svq->num_free >= 2+MAX_SKB_FRAGS) {
 -                              netif_start_queue(dev);
 -                              virtqueue_disable_cb(vi->svq);
++                      free_old_xmit_skbs(sq);
++                      if (sq->vq->num_free >= 2+MAX_SKB_FRAGS) {
 +                              netif_start_subqueue(dev, qnum);
 +                              virtqueue_disable_cb(sq->vq);
                        }
                }
        }
@@@ -822,13 -726,23 +812,13 @@@ static struct rtnl_link_stats64 *virtne
  static void virtnet_netpoll(struct net_device *dev)
  {
        struct virtnet_info *vi = netdev_priv(dev);
 +      int i;
  
 -      napi_schedule(&vi->napi);
 +      for (i = 0; i < vi->curr_queue_pairs; i++)
 +              napi_schedule(&vi->rq[i].napi);
  }
  #endif
  
 -static int virtnet_open(struct net_device *dev)
 -{
 -      struct virtnet_info *vi = netdev_priv(dev);
 -
 -      /* Make sure we have some buffers: if oom use wq. */
 -      if (!try_fill_recv(vi, GFP_KERNEL))
 -              schedule_delayed_work(&vi->refill, 0);
 -
 -      virtnet_napi_enable(vi);
 -      return 0;
 -}
 -
  /*
   * Send command via the control virtqueue and check status.  Commands
   * supported by the hypervisor, as indicated by feature bits, should
@@@ -884,39 -798,13 +874,39 @@@ static void virtnet_ack_link_announce(s
        rtnl_unlock();
  }
  
 +static int virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs)
 +{
 +      struct scatterlist sg;
 +      struct virtio_net_ctrl_mq s;
 +      struct net_device *dev = vi->dev;
 +
 +      if (!vi->has_cvq || !virtio_has_feature(vi->vdev, VIRTIO_NET_F_MQ))
 +              return 0;
 +
 +      s.virtqueue_pairs = queue_pairs;
 +      sg_init_one(&sg, &s, sizeof(s));
 +
 +      if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MQ,
 +                                VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET, &sg, 1, 0)){
 +              dev_warn(&dev->dev, "Fail to set num of queue pairs to %d\n",
 +                       queue_pairs);
 +              return -EINVAL;
 +      } else
 +              vi->curr_queue_pairs = queue_pairs;
 +
 +      return 0;
 +}
 +
  static int virtnet_close(struct net_device *dev)
  {
        struct virtnet_info *vi = netdev_priv(dev);
 +      int i;
  
        /* Make sure refill_work doesn't re-enable napi! */
        cancel_delayed_work_sync(&vi->refill);
 -      napi_disable(&vi->napi);
 +
 +      for (i = 0; i < vi->max_queue_pairs; i++)
 +              napi_disable(&vi->rq[i].napi);
  
        return 0;
  }
@@@ -1023,43 -911,16 +1013,43 @@@ static int virtnet_vlan_rx_kill_vid(str
        return 0;
  }
  
 +static void virtnet_set_affinity(struct virtnet_info *vi, bool set)
 +{
 +      int i;
 +
 +      /* In multiqueue mode, when the number of cpu is equal to the number of
 +       * queue pairs, we let the queue pairs to be private to one cpu by
 +       * setting the affinity hint to eliminate the contention.
 +       */
 +      if ((vi->curr_queue_pairs == 1 ||
 +           vi->max_queue_pairs != num_online_cpus()) && set) {
 +              if (vi->affinity_hint_set)
 +                      set = false;
 +              else
 +                      return;
 +      }
 +
 +      for (i = 0; i < vi->max_queue_pairs; i++) {
 +              int cpu = set ? i : -1;
 +              virtqueue_set_affinity(vi->rq[i].vq, cpu);
 +              virtqueue_set_affinity(vi->sq[i].vq, cpu);
 +      }
 +
 +      if (set)
 +              vi->affinity_hint_set = true;
 +      else
 +              vi->affinity_hint_set = false;
 +}
 +
  static void virtnet_get_ringparam(struct net_device *dev,
                                struct ethtool_ringparam *ring)
  {
        struct virtnet_info *vi = netdev_priv(dev);
  
 -      ring->rx_max_pending = virtqueue_get_vring_size(vi->rvq);
 -      ring->tx_max_pending = virtqueue_get_vring_size(vi->svq);
 +      ring->rx_max_pending = virtqueue_get_vring_size(vi->rq[0].vq);
 +      ring->tx_max_pending = virtqueue_get_vring_size(vi->sq[0].vq);
        ring->rx_pending = ring->rx_max_pending;
        ring->tx_pending = ring->tx_max_pending;
 -
  }
  
  
@@@ -1075,53 -936,10 +1065,53 @@@ static void virtnet_get_drvinfo(struct 
  
  }
  
 +/* TODO: Eliminate OOO packets during switching */
 +static int virtnet_set_channels(struct net_device *dev,
 +                              struct ethtool_channels *channels)
 +{
 +      struct virtnet_info *vi = netdev_priv(dev);
 +      u16 queue_pairs = channels->combined_count;
 +      int err;
 +
 +      /* We don't support separate rx/tx channels.
 +       * We don't allow setting 'other' channels.
 +       */
 +      if (channels->rx_count || channels->tx_count || channels->other_count)
 +              return -EINVAL;
 +
 +      if (queue_pairs > vi->max_queue_pairs)
 +              return -EINVAL;
 +
 +      err = virtnet_set_queues(vi, queue_pairs);
 +      if (!err) {
 +              netif_set_real_num_tx_queues(dev, queue_pairs);
 +              netif_set_real_num_rx_queues(dev, queue_pairs);
 +
 +              virtnet_set_affinity(vi, true);
 +      }
 +
 +      return err;
 +}
 +
 +static void virtnet_get_channels(struct net_device *dev,
 +                               struct ethtool_channels *channels)
 +{
 +      struct virtnet_info *vi = netdev_priv(dev);
 +
 +      channels->combined_count = vi->curr_queue_pairs;
 +      channels->max_combined = vi->max_queue_pairs;
 +      channels->max_other = 0;
 +      channels->rx_count = 0;
 +      channels->tx_count = 0;
 +      channels->other_count = 0;
 +}
 +
  static const struct ethtool_ops virtnet_ethtool_ops = {
        .get_drvinfo = virtnet_get_drvinfo,
        .get_link = ethtool_op_get_link,
        .get_ringparam = virtnet_get_ringparam,
 +      .set_channels = virtnet_set_channels,
 +      .get_channels = virtnet_get_channels,
  };
  
  #define MIN_MTU 68
@@@ -1135,21 -953,6 +1125,21 @@@ static int virtnet_change_mtu(struct ne
        return 0;
  }
  
 +/* To avoid contending a lock hold by a vcpu who would exit to host, select the
 + * txq based on the processor id.
 + * TODO: handle cpu hotplug.
 + */
 +static u16 virtnet_select_queue(struct net_device *dev, struct sk_buff *skb)
 +{
 +      int txq = skb_rx_queue_recorded(skb) ? skb_get_rx_queue(skb) :
 +                smp_processor_id();
 +
 +      while (unlikely(txq >= dev->real_num_tx_queues))
 +              txq -= dev->real_num_tx_queues;
 +
 +      return txq;
 +}
 +
  static const struct net_device_ops virtnet_netdev = {
        .ndo_open            = virtnet_open,
        .ndo_stop            = virtnet_close,
        .ndo_get_stats64     = virtnet_stats,
        .ndo_vlan_rx_add_vid = virtnet_vlan_rx_add_vid,
        .ndo_vlan_rx_kill_vid = virtnet_vlan_rx_kill_vid,
 +      .ndo_select_queue     = virtnet_select_queue,
  #ifdef CONFIG_NET_POLL_CONTROLLER
        .ndo_poll_controller = virtnet_netpoll,
  #endif
@@@ -1197,10 -999,10 +1187,10 @@@ static void virtnet_config_changed_work
  
        if (vi->status & VIRTIO_NET_S_LINK_UP) {
                netif_carrier_on(vi->dev);
 -              netif_wake_queue(vi->dev);
 +              netif_tx_wake_all_queues(vi->dev);
        } else {
                netif_carrier_off(vi->dev);
 -              netif_stop_queue(vi->dev);
 +              netif_tx_stop_all_queues(vi->dev);
        }
  done:
        mutex_unlock(&vi->config_lock);
@@@ -1213,203 -1015,41 +1203,203 @@@ static void virtnet_config_changed(stru
        schedule_work(&vi->config_work);
  }
  
 -static int init_vqs(struct virtnet_info *vi)
 +static void virtnet_free_queues(struct virtnet_info *vi)
  {
 -      struct virtqueue *vqs[3];
 -      vq_callback_t *callbacks[] = { skb_recv_done, skb_xmit_done, NULL};
 -      const char *names[] = { "input", "output", "control" };
 -      int nvqs, err;
 +      kfree(vi->rq);
 +      kfree(vi->sq);
 +}
  
 -      /* We expect two virtqueues, receive then send,
 -       * and optionally control. */
 -      nvqs = virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ) ? 3 : 2;
 +static void free_receive_bufs(struct virtnet_info *vi)
 +{
 +      int i;
  
 -      err = vi->vdev->config->find_vqs(vi->vdev, nvqs, vqs, callbacks, names);
 -      if (err)
 -              return err;
 +      for (i = 0; i < vi->max_queue_pairs; i++) {
 +              while (vi->rq[i].pages)
 +                      __free_pages(get_a_page(&vi->rq[i], GFP_KERNEL), 0);
 +      }
 +}
  
 -      vi->rvq = vqs[0];
 -      vi->svq = vqs[1];
 +static void free_unused_bufs(struct virtnet_info *vi)
 +{
 +      void *buf;
 +      int i;
  
 -      if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ)) {
 -              vi->cvq = vqs[2];
 +      for (i = 0; i < vi->max_queue_pairs; i++) {
 +              struct virtqueue *vq = vi->sq[i].vq;
 +              while ((buf = virtqueue_detach_unused_buf(vq)) != NULL)
 +                      dev_kfree_skb(buf);
 +      }
  
 +      for (i = 0; i < vi->max_queue_pairs; i++) {
 +              struct virtqueue *vq = vi->rq[i].vq;
 +
 +              while ((buf = virtqueue_detach_unused_buf(vq)) != NULL) {
 +                      if (vi->mergeable_rx_bufs || vi->big_packets)
 +                              give_pages(&vi->rq[i], buf);
 +                      else
 +                              dev_kfree_skb(buf);
 +                      --vi->rq[i].num;
 +              }
 +              BUG_ON(vi->rq[i].num != 0);
 +      }
 +}
 +
 +static void virtnet_del_vqs(struct virtnet_info *vi)
 +{
 +      struct virtio_device *vdev = vi->vdev;
 +
 +      virtnet_set_affinity(vi, false);
 +
 +      vdev->config->del_vqs(vdev);
 +
 +      virtnet_free_queues(vi);
 +}
 +
 +static int virtnet_find_vqs(struct virtnet_info *vi)
 +{
 +      vq_callback_t **callbacks;
 +      struct virtqueue **vqs;
 +      int ret = -ENOMEM;
 +      int i, total_vqs;
 +      const char **names;
 +
 +      /* We expect 1 RX virtqueue followed by 1 TX virtqueue, followed by
 +       * possible N-1 RX/TX queue pairs used in multiqueue mode, followed by
 +       * possible control vq.
 +       */
 +      total_vqs = vi->max_queue_pairs * 2 +
 +                  virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ);
 +
 +      /* Allocate space for find_vqs parameters */
 +      vqs = kzalloc(total_vqs * sizeof(*vqs), GFP_KERNEL);
 +      if (!vqs)
 +              goto err_vq;
 +      callbacks = kmalloc(total_vqs * sizeof(*callbacks), GFP_KERNEL);
 +      if (!callbacks)
 +              goto err_callback;
 +      names = kmalloc(total_vqs * sizeof(*names), GFP_KERNEL);
 +      if (!names)
 +              goto err_names;
 +
 +      /* Parameters for control virtqueue, if any */
 +      if (vi->has_cvq) {
 +              callbacks[total_vqs - 1] = NULL;
 +              names[total_vqs - 1] = "control";
 +      }
 +
 +      /* Allocate/initialize parameters for send/receive virtqueues */
 +      for (i = 0; i < vi->max_queue_pairs; i++) {
 +              callbacks[rxq2vq(i)] = skb_recv_done;
 +              callbacks[txq2vq(i)] = skb_xmit_done;
 +              sprintf(vi->rq[i].name, "input.%d", i);
 +              sprintf(vi->sq[i].name, "output.%d", i);
 +              names[rxq2vq(i)] = vi->rq[i].name;
 +              names[txq2vq(i)] = vi->sq[i].name;
 +      }
 +
 +      ret = vi->vdev->config->find_vqs(vi->vdev, total_vqs, vqs, callbacks,
 +                                       names);
 +      if (ret)
 +              goto err_find;
 +
 +      if (vi->has_cvq) {
 +              vi->cvq = vqs[total_vqs - 1];
                if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VLAN))
                        vi->dev->features |= NETIF_F_HW_VLAN_FILTER;
        }
 +
 +      for (i = 0; i < vi->max_queue_pairs; i++) {
 +              vi->rq[i].vq = vqs[rxq2vq(i)];
 +              vi->sq[i].vq = vqs[txq2vq(i)];
 +      }
 +
 +      kfree(names);
 +      kfree(callbacks);
 +      kfree(vqs);
 +
 +      return 0;
 +
 +err_find:
 +      kfree(names);
 +err_names:
 +      kfree(callbacks);
 +err_callback:
 +      kfree(vqs);
 +err_vq:
 +      return ret;
 +}
 +
 +static int virtnet_alloc_queues(struct virtnet_info *vi)
 +{
 +      int i;
 +
 +      vi->sq = kzalloc(sizeof(*vi->sq) * vi->max_queue_pairs, GFP_KERNEL);
 +      if (!vi->sq)
 +              goto err_sq;
 +      vi->rq = kzalloc(sizeof(*vi->rq) * vi->max_queue_pairs, GFP_KERNEL);
 +      if (!vi->rq)
 +              goto err_rq;
 +
 +      INIT_DELAYED_WORK(&vi->refill, refill_work);
 +      for (i = 0; i < vi->max_queue_pairs; i++) {
 +              vi->rq[i].pages = NULL;
 +              netif_napi_add(vi->dev, &vi->rq[i].napi, virtnet_poll,
 +                             napi_weight);
 +
 +              sg_init_table(vi->rq[i].sg, ARRAY_SIZE(vi->rq[i].sg));
 +              sg_init_table(vi->sq[i].sg, ARRAY_SIZE(vi->sq[i].sg));
 +      }
 +
        return 0;
 +
 +err_rq:
 +      kfree(vi->sq);
 +err_sq:
 +      return -ENOMEM;
 +}
 +
 +static int init_vqs(struct virtnet_info *vi)
 +{
 +      int ret;
 +
 +      /* Allocate send & receive queues */
 +      ret = virtnet_alloc_queues(vi);
 +      if (ret)
 +              goto err;
 +
 +      ret = virtnet_find_vqs(vi);
 +      if (ret)
 +              goto err_free;
 +
 +      virtnet_set_affinity(vi, true);
 +      return 0;
 +
 +err_free:
 +      virtnet_free_queues(vi);
 +err:
 +      return ret;
  }
  
  static int virtnet_probe(struct virtio_device *vdev)
  {
 -      int err;
 +      int i, err;
        struct net_device *dev;
        struct virtnet_info *vi;
 +      u16 max_queue_pairs;
 +
 +      /* Find if host supports multiqueue virtio_net device */
 +      err = virtio_config_val(vdev, VIRTIO_NET_F_MQ,
 +                              offsetof(struct virtio_net_config,
 +                              max_virtqueue_pairs), &max_queue_pairs);
 +
 +      /* We need at least 2 queue's */
 +      if (err || max_queue_pairs < VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MIN ||
 +          max_queue_pairs > VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX ||
 +          !virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ))
 +              max_queue_pairs = 1;
  
        /* Allocate ourselves a network device with room for our info */
 -      dev = alloc_etherdev(sizeof(struct virtnet_info));
 +      dev = alloc_etherdev_mq(sizeof(struct virtnet_info), max_queue_pairs);
        if (!dev)
                return -ENOMEM;
  
  
        /* Set up our device-specific information */
        vi = netdev_priv(dev);
 -      netif_napi_add(dev, &vi->napi, virtnet_poll, napi_weight);
        vi->dev = dev;
        vi->vdev = vdev;
        vdev->priv = vi;
 -      vi->pages = NULL;
        vi->stats = alloc_percpu(struct virtnet_stats);
        err = -ENOMEM;
        if (vi->stats == NULL)
                goto free;
  
 -      INIT_DELAYED_WORK(&vi->refill, refill_work);
        mutex_init(&vi->config_lock);
        vi->config_enable = true;
        INIT_WORK(&vi->config_work, virtnet_config_changed_work);
 -      sg_init_table(vi->rx_sg, ARRAY_SIZE(vi->rx_sg));
 -      sg_init_table(vi->tx_sg, ARRAY_SIZE(vi->tx_sg));
  
        /* If we can receive ANY GSO packets, we must allocate large ones. */
        if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO4) ||
        if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF))
                vi->mergeable_rx_bufs = true;
  
 +      if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ))
 +              vi->has_cvq = true;
 +
 +      /* Use single tx/rx queue pair as default */
 +      vi->curr_queue_pairs = 1;
 +      vi->max_queue_pairs = max_queue_pairs;
 +
 +      /* Allocate/initialize the rx/tx queues, and invoke find_vqs */
        err = init_vqs(vi);
        if (err)
                goto free_stats;
  
 +      netif_set_real_num_tx_queues(dev, 1);
 +      netif_set_real_num_rx_queues(dev, 1);
 +
        err = register_netdev(dev);
        if (err) {
                pr_debug("virtio_net: registering device failed\n");
        }
  
        /* Last of all, set up some receive buffers. */
 -      try_fill_recv(vi, GFP_KERNEL);
 -
 -      /* If we didn't even get one input buffer, we're useless. */
 -      if (vi->num == 0) {
 -              err = -ENOMEM;
 -              goto unregister;
 +      for (i = 0; i < vi->max_queue_pairs; i++) {
 +              try_fill_recv(&vi->rq[i], GFP_KERNEL);
 +
 +              /* If we didn't even get one input buffer, we're useless. */
 +              if (vi->rq[i].num == 0) {
 +                      free_unused_bufs(vi);
 +                      err = -ENOMEM;
 +                      goto free_recv_bufs;
 +              }
        }
  
        /* Assume link up if device can't report link status,
                netif_carrier_on(dev);
        }
  
 -      pr_debug("virtnet: registered device %s\n", dev->name);
 +      pr_debug("virtnet: registered device %s with %d RX and TX vq's\n",
 +               dev->name, max_queue_pairs);
 +
        return 0;
  
 -unregister:
 +free_recv_bufs:
 +      free_receive_bufs(vi);
        unregister_netdev(dev);
  free_vqs:
 -      vdev->config->del_vqs(vdev);
 +      cancel_delayed_work_sync(&vi->refill);
 +      virtnet_del_vqs(vi);
  free_stats:
        free_percpu(vi->stats);
  free:
        return err;
  }
  
 -static void free_unused_bufs(struct virtnet_info *vi)
 -{
 -      void *buf;
 -      while (1) {
 -              buf = virtqueue_detach_unused_buf(vi->svq);
 -              if (!buf)
 -                      break;
 -              dev_kfree_skb(buf);
 -      }
 -      while (1) {
 -              buf = virtqueue_detach_unused_buf(vi->rvq);
 -              if (!buf)
 -                      break;
 -              if (vi->mergeable_rx_bufs || vi->big_packets)
 -                      give_pages(vi, buf);
 -              else
 -                      dev_kfree_skb(buf);
 -              --vi->num;
 -      }
 -      BUG_ON(vi->num != 0);
 -}
 -
  static void remove_vq_common(struct virtnet_info *vi)
  {
        vi->vdev->config->reset(vi->vdev);
        /* Free unused buffers in both send and recv, if any. */
        free_unused_bufs(vi);
  
 -      vi->vdev->config->del_vqs(vi->vdev);
 +      free_receive_bufs(vi);
  
 -      while (vi->pages)
 -              __free_pages(get_a_page(vi, GFP_KERNEL), 0);
 +      virtnet_del_vqs(vi);
  }
  
 -static void __devexit virtnet_remove(struct virtio_device *vdev)
 +static void virtnet_remove(struct virtio_device *vdev)
  {
        struct virtnet_info *vi = vdev->priv;
  
  static int virtnet_freeze(struct virtio_device *vdev)
  {
        struct virtnet_info *vi = vdev->priv;
 +      int i;
  
        /* Prevent config work handler from accessing the device */
        mutex_lock(&vi->config_lock);
        cancel_delayed_work_sync(&vi->refill);
  
        if (netif_running(vi->dev))
 -              napi_disable(&vi->napi);
 +              for (i = 0; i < vi->max_queue_pairs; i++) {
 +                      napi_disable(&vi->rq[i].napi);
 +                      netif_napi_del(&vi->rq[i].napi);
 +              }
  
        remove_vq_common(vi);
  
  static int virtnet_restore(struct virtio_device *vdev)
  {
        struct virtnet_info *vi = vdev->priv;
 -      int err;
 +      int err, i;
  
        err = init_vqs(vi);
        if (err)
                return err;
  
        if (netif_running(vi->dev))
 -              virtnet_napi_enable(vi);
 +              for (i = 0; i < vi->max_queue_pairs; i++)
 +                      virtnet_napi_enable(&vi->rq[i]);
  
        netif_device_attach(vi->dev);
  
 -      if (!try_fill_recv(vi, GFP_KERNEL))
 -              schedule_delayed_work(&vi->refill, 0);
 +      for (i = 0; i < vi->max_queue_pairs; i++)
 +              if (!try_fill_recv(&vi->rq[i], GFP_KERNEL))
 +                      schedule_delayed_work(&vi->refill, 0);
  
        mutex_lock(&vi->config_lock);
        vi->config_enable = true;
        mutex_unlock(&vi->config_lock);
  
 +      virtnet_set_queues(vi, vi->curr_queue_pairs);
 +
        return 0;
  }
  #endif
@@@ -1637,7 -1279,7 +1627,7 @@@ static unsigned int features[] = 
        VIRTIO_NET_F_GUEST_ECN, VIRTIO_NET_F_GUEST_UFO,
        VIRTIO_NET_F_MRG_RXBUF, VIRTIO_NET_F_STATUS, VIRTIO_NET_F_CTRL_VQ,
        VIRTIO_NET_F_CTRL_RX, VIRTIO_NET_F_CTRL_VLAN,
 -      VIRTIO_NET_F_GUEST_ANNOUNCE,
 +      VIRTIO_NET_F_GUEST_ANNOUNCE, VIRTIO_NET_F_MQ,
  };
  
  static struct virtio_driver virtio_net_driver = {
        .driver.owner = THIS_MODULE,
        .id_table =     id_table,
        .probe =        virtnet_probe,
 -      .remove =       __devexit_p(virtnet_remove),
 +      .remove =       virtnet_remove,
        .config_changed = virtnet_config_changed,
  #ifdef CONFIG_PM
        .freeze =       virtnet_freeze,
@@@ -215,7 -215,7 +215,7 @@@ static void virtscsi_ctrl_done(struct v
  static int virtscsi_kick_event(struct virtio_scsi *vscsi,
                               struct virtio_scsi_event_node *event_node)
  {
-       int ret;
+       int err;
        struct scatterlist sg;
        unsigned long flags;
  
  
        spin_lock_irqsave(&vscsi->event_vq.vq_lock, flags);
  
-       ret = virtqueue_add_buf(vscsi->event_vq.vq, &sg, 0, 1, event_node, GFP_ATOMIC);
-       if (ret >= 0)
+       err = virtqueue_add_buf(vscsi->event_vq.vq, &sg, 0, 1, event_node,
+                               GFP_ATOMIC);
+       if (!err)
                virtqueue_kick(vscsi->event_vq.vq);
  
        spin_unlock_irqrestore(&vscsi->event_vq.vq_lock, flags);
  
-       return ret;
+       return err;
  }
  
  static int virtscsi_kick_event_all(struct virtio_scsi *vscsi)
@@@ -410,22 -411,23 +411,23 @@@ static int virtscsi_kick_cmd(struct vir
  {
        unsigned int out_num, in_num;
        unsigned long flags;
-       int ret;
+       int err;
+       bool needs_kick = false;
  
        spin_lock_irqsave(&tgt->tgt_lock, flags);
        virtscsi_map_cmd(tgt, cmd, &out_num, &in_num, req_size, resp_size);
  
        spin_lock(&vq->vq_lock);
-       ret = virtqueue_add_buf(vq->vq, tgt->sg, out_num, in_num, cmd, gfp);
+       err = virtqueue_add_buf(vq->vq, tgt->sg, out_num, in_num, cmd, gfp);
        spin_unlock(&tgt->tgt_lock);
-       if (ret >= 0)
-               ret = virtqueue_kick_prepare(vq->vq);
+       if (!err)
+               needs_kick = virtqueue_kick_prepare(vq->vq);
  
        spin_unlock_irqrestore(&vq->vq_lock, flags);
  
-       if (ret > 0)
+       if (needs_kick)
                virtqueue_notify(vq->vq);
-       return ret;
+       return err;
  }
  
  static int virtscsi_queuecommand(struct Scsi_Host *sh, struct scsi_cmnd *sc)
  
        if (virtscsi_kick_cmd(tgt, &vscsi->req_vq, cmd,
                              sizeof cmd->req.cmd, sizeof cmd->resp.cmd,
-                             GFP_ATOMIC) >= 0)
+                             GFP_ATOMIC) == 0)
                ret = 0;
 +      else
 +              mempool_free(cmd, virtscsi_cmd_pool);
  
  out:
        return ret;
diff --combined drivers/virtio/virtio.c
@@@ -10,33 -10,32 +10,32 @@@ static DEFINE_IDA(virtio_index_ida)
  static ssize_t device_show(struct device *_d,
                           struct device_attribute *attr, char *buf)
  {
-       struct virtio_device *dev = container_of(_d,struct virtio_device,dev);
+       struct virtio_device *dev = dev_to_virtio(_d);
        return sprintf(buf, "0x%04x\n", dev->id.device);
  }
  static ssize_t vendor_show(struct device *_d,
                           struct device_attribute *attr, char *buf)
  {
-       struct virtio_device *dev = container_of(_d,struct virtio_device,dev);
+       struct virtio_device *dev = dev_to_virtio(_d);
        return sprintf(buf, "0x%04x\n", dev->id.vendor);
  }
  static ssize_t status_show(struct device *_d,
                           struct device_attribute *attr, char *buf)
  {
-       struct virtio_device *dev = container_of(_d,struct virtio_device,dev);
+       struct virtio_device *dev = dev_to_virtio(_d);
        return sprintf(buf, "0x%08x\n", dev->config->get_status(dev));
  }
  static ssize_t modalias_show(struct device *_d,
                             struct device_attribute *attr, char *buf)
  {
-       struct virtio_device *dev = container_of(_d,struct virtio_device,dev);
+       struct virtio_device *dev = dev_to_virtio(_d);
        return sprintf(buf, "virtio:d%08Xv%08X\n",
                       dev->id.device, dev->id.vendor);
  }
  static ssize_t features_show(struct device *_d,
                             struct device_attribute *attr, char *buf)
  {
-       struct virtio_device *dev = container_of(_d, struct virtio_device, dev);
+       struct virtio_device *dev = dev_to_virtio(_d);
        unsigned int i;
        ssize_t len = 0;
  
@@@ -71,10 -70,10 +70,10 @@@ static inline int virtio_id_match(cons
  static int virtio_dev_match(struct device *_dv, struct device_driver *_dr)
  {
        unsigned int i;
-       struct virtio_device *dev = container_of(_dv,struct virtio_device,dev);
+       struct virtio_device *dev = dev_to_virtio(_dv);
        const struct virtio_device_id *ids;
  
-       ids = container_of(_dr, struct virtio_driver, driver)->id_table;
+       ids = drv_to_virtio(_dr)->id_table;
        for (i = 0; ids[i].device; i++)
                if (virtio_id_match(dev, &ids[i]))
                        return 1;
@@@ -83,7 -82,7 +82,7 @@@
  
  static int virtio_uevent(struct device *_dv, struct kobj_uevent_env *env)
  {
-       struct virtio_device *dev = container_of(_dv,struct virtio_device,dev);
+       struct virtio_device *dev = dev_to_virtio(_dv);
  
        return add_uevent_var(env, "MODALIAS=virtio:d%08Xv%08X",
                              dev->id.device, dev->id.vendor);
@@@ -98,8 -97,7 +97,7 @@@ void virtio_check_driver_offered_featur
                                         unsigned int fbit)
  {
        unsigned int i;
-       struct virtio_driver *drv = container_of(vdev->dev.driver,
-                                                struct virtio_driver, driver);
+       struct virtio_driver *drv = drv_to_virtio(vdev->dev.driver);
  
        for (i = 0; i < drv->feature_table_size; i++)
                if (drv->feature_table[i] == fbit)
@@@ -111,9 -109,8 +109,8 @@@ EXPORT_SYMBOL_GPL(virtio_check_driver_o
  static int virtio_dev_probe(struct device *_d)
  {
        int err, i;
-       struct virtio_device *dev = container_of(_d,struct virtio_device,dev);
-       struct virtio_driver *drv = container_of(dev->dev.driver,
-                                                struct virtio_driver, driver);
+       struct virtio_device *dev = dev_to_virtio(_d);
+       struct virtio_driver *drv = drv_to_virtio(dev->dev.driver);
        u32 device_features;
  
        /* We have a driver! */
  
  static int virtio_dev_remove(struct device *_d)
  {
-       struct virtio_device *dev = container_of(_d,struct virtio_device,dev);
-       struct virtio_driver *drv = container_of(dev->dev.driver,
-                                                struct virtio_driver, driver);
+       struct virtio_device *dev = dev_to_virtio(_d);
+       struct virtio_driver *drv = drv_to_virtio(dev->dev.driver);
  
        drv->remove(dev);
  
@@@ -225,10 -221,8 +221,10 @@@ EXPORT_SYMBOL_GPL(register_virtio_devic
  
  void unregister_virtio_device(struct virtio_device *dev)
  {
 +      int index = dev->index; /* save for after device release */
 +
        device_unregister(&dev->dev);
 -      ida_simple_remove(&virtio_index_ida, dev->index);
 +      ida_simple_remove(&virtio_index_ida, index);
  }
  EXPORT_SYMBOL_GPL(unregister_virtio_device);
  
  #include <linux/delay.h>
  #include <linux/slab.h>
  #include <linux/module.h>
 +#include <linux/balloon_compaction.h>
  
  /*
   * Balloon device works in 4K page units.  So each page is pointed to by
   * multiple balloon pages.  All memory counters in this driver are in balloon
   * page units.
   */
 -#define VIRTIO_BALLOON_PAGES_PER_PAGE (PAGE_SIZE >> VIRTIO_BALLOON_PFN_SHIFT)
 +#define VIRTIO_BALLOON_PAGES_PER_PAGE (unsigned)(PAGE_SIZE >> VIRTIO_BALLOON_PFN_SHIFT)
 +#define VIRTIO_BALLOON_ARRAY_PFNS_MAX 256
  
  struct virtio_balloon
  {
        /* Number of balloon pages we've told the Host we're not using. */
        unsigned int num_pages;
        /*
 -       * The pages we've told the Host we're not using.
 +       * The pages we've told the Host we're not using are enqueued
 +       * at vb_dev_info->pages list.
         * Each page on this list adds VIRTIO_BALLOON_PAGES_PER_PAGE
         * to num_pages above.
         */
 -      struct list_head pages;
 +      struct balloon_dev_info *vb_dev_info;
 +
 +      /* Synchronize access/update to this struct virtio_balloon elements */
 +      struct mutex balloon_lock;
  
        /* The array of pfns we tell the Host about. */
        unsigned int num_pfns;
 -      u32 pfns[256];
 +      u32 pfns[VIRTIO_BALLOON_ARRAY_PFNS_MAX];
  
        /* Memory statistics */
        int need_stats_update;
@@@ -128,21 -122,17 +128,20 @@@ static void set_page_pfns(u32 pfns[], s
  
  static void fill_balloon(struct virtio_balloon *vb, size_t num)
  {
 +      struct balloon_dev_info *vb_dev_info = vb->vb_dev_info;
 +
        /* We can only do one array worth at a time. */
        num = min(num, ARRAY_SIZE(vb->pfns));
  
 +      mutex_lock(&vb->balloon_lock);
        for (vb->num_pfns = 0; vb->num_pfns < num;
             vb->num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE) {
 -              struct page *page = alloc_page(GFP_HIGHUSER | __GFP_NORETRY |
 -                                      __GFP_NOMEMALLOC | __GFP_NOWARN);
 +              struct page *page = balloon_page_enqueue(vb_dev_info);
 +
                if (!page) {
-                       if (printk_ratelimit())
-                               dev_printk(KERN_INFO, &vb->vdev->dev,
-                                          "Out of puff! Can't get %u pages\n",
-                                          VIRTIO_BALLOON_PAGES_PER_PAGE);
+                       dev_info_ratelimited(&vb->vdev->dev,
 -                                           "Out of puff! Can't get %zu pages\n",
 -                                           num);
++                                           "Out of puff! Can't get %u pages\n",
++                                           VIRTIO_BALLOON_PAGES_PER_PAGE);
                        /* Sleep for at least 1/5 of a second before retry. */
                        msleep(200);
                        break;
                set_page_pfns(vb->pfns + vb->num_pfns, page);
                vb->num_pages += VIRTIO_BALLOON_PAGES_PER_PAGE;
                totalram_pages--;
 -              list_add(&page->lru, &vb->pages);
        }
  
 -      /* Didn't get any?  Oh well. */
 -      if (vb->num_pfns == 0)
 -              return;
 -
 -      tell_host(vb, vb->inflate_vq);
 +      /* Did we get any? */
 +      if (vb->num_pfns != 0)
 +              tell_host(vb, vb->inflate_vq);
 +      mutex_unlock(&vb->balloon_lock);
  }
  
  static void release_pages_by_pfn(const u32 pfns[], unsigned int num)
  
        /* Find pfns pointing at start of each page, get pages and free them. */
        for (i = 0; i < num; i += VIRTIO_BALLOON_PAGES_PER_PAGE) {
 -              __free_page(balloon_pfn_to_page(pfns[i]));
 +              balloon_page_free(balloon_pfn_to_page(pfns[i]));
                totalram_pages++;
        }
  }
  static void leak_balloon(struct virtio_balloon *vb, size_t num)
  {
        struct page *page;
 +      struct balloon_dev_info *vb_dev_info = vb->vb_dev_info;
  
        /* We can only do one array worth at a time. */
        num = min(num, ARRAY_SIZE(vb->pfns));
  
 +      mutex_lock(&vb->balloon_lock);
        for (vb->num_pfns = 0; vb->num_pfns < num;
             vb->num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE) {
 -              page = list_first_entry(&vb->pages, struct page, lru);
 -              list_del(&page->lru);
 +              page = balloon_page_dequeue(vb_dev_info);
 +              if (!page)
 +                      break;
                set_page_pfns(vb->pfns + vb->num_pfns, page);
                vb->num_pages -= VIRTIO_BALLOON_PAGES_PER_PAGE;
        }
         * is true, we *have* to do it in this order
         */
        tell_host(vb, vb->deflate_vq);
 +      mutex_unlock(&vb->balloon_lock);
        release_pages_by_pfn(vb->pfns, vb->num_pfns);
  }
  
@@@ -350,84 -338,9 +349,84 @@@ static int init_vqs(struct virtio_ballo
        return 0;
  }
  
 +static const struct address_space_operations virtio_balloon_aops;
 +#ifdef CONFIG_BALLOON_COMPACTION
 +/*
 + * virtballoon_migratepage - perform the balloon page migration on behalf of
 + *                         a compaction thread.    (called under page lock)
 + * @mapping: the page->mapping which will be assigned to the new migrated page.
 + * @newpage: page that will replace the isolated page after migration finishes.
 + * @page   : the isolated (old) page that is about to be migrated to newpage.
 + * @mode   : compaction mode -- not used for balloon page migration.
 + *
 + * After a ballooned page gets isolated by compaction procedures, this is the
 + * function that performs the page migration on behalf of a compaction thread.
 + * The page migration for virtio balloon is done in a simple swap fashion which
 + * follows these two macro steps:
 + *  1) insert newpage into vb->pages list and update the host about it;
 + *  2) update the host about the old page removed from vb->pages list;
 + *
 + * This function performs the balloon page migration task.
 + * Called through balloon_mapping->a_ops->migratepage
 + */
 +int virtballoon_migratepage(struct address_space *mapping,
 +              struct page *newpage, struct page *page, enum migrate_mode mode)
 +{
 +      struct balloon_dev_info *vb_dev_info = balloon_page_device(page);
 +      struct virtio_balloon *vb;
 +      unsigned long flags;
 +
 +      BUG_ON(!vb_dev_info);
 +
 +      vb = vb_dev_info->balloon_device;
 +
 +      /*
 +       * In order to avoid lock contention while migrating pages concurrently
 +       * to leak_balloon() or fill_balloon() we just give up the balloon_lock
 +       * this turn, as it is easier to retry the page migration later.
 +       * This also prevents fill_balloon() getting stuck into a mutex
 +       * recursion in the case it ends up triggering memory compaction
 +       * while it is attempting to inflate the balloon.
 +       */
 +      if (!mutex_trylock(&vb->balloon_lock))
 +              return -EAGAIN;
 +
 +      /* balloon's page migration 1st step  -- inflate "newpage" */
 +      spin_lock_irqsave(&vb_dev_info->pages_lock, flags);
 +      balloon_page_insert(newpage, mapping, &vb_dev_info->pages);
 +      vb_dev_info->isolated_pages--;
 +      spin_unlock_irqrestore(&vb_dev_info->pages_lock, flags);
 +      vb->num_pfns = VIRTIO_BALLOON_PAGES_PER_PAGE;
 +      set_page_pfns(vb->pfns, newpage);
 +      tell_host(vb, vb->inflate_vq);
 +
 +      /*
 +       * balloon's page migration 2nd step -- deflate "page"
 +       *
 +       * It's safe to delete page->lru here because this page is at
 +       * an isolated migration list, and this step is expected to happen here
 +       */
 +      balloon_page_delete(page);
 +      vb->num_pfns = VIRTIO_BALLOON_PAGES_PER_PAGE;
 +      set_page_pfns(vb->pfns, page);
 +      tell_host(vb, vb->deflate_vq);
 +
 +      mutex_unlock(&vb->balloon_lock);
 +
 +      return MIGRATEPAGE_BALLOON_SUCCESS;
 +}
 +
 +/* define the balloon_mapping->a_ops callback to allow balloon page migration */
 +static const struct address_space_operations virtio_balloon_aops = {
 +                      .migratepage = virtballoon_migratepage,
 +};
 +#endif /* CONFIG_BALLOON_COMPACTION */
 +
  static int virtballoon_probe(struct virtio_device *vdev)
  {
        struct virtio_balloon *vb;
 +      struct address_space *vb_mapping;
 +      struct balloon_dev_info *vb_devinfo;
        int err;
  
        vdev->priv = vb = kmalloc(sizeof(*vb), GFP_KERNEL);
                goto out;
        }
  
 -      INIT_LIST_HEAD(&vb->pages);
        vb->num_pages = 0;
 +      mutex_init(&vb->balloon_lock);
        init_waitqueue_head(&vb->config_change);
        init_waitqueue_head(&vb->acked);
        vb->vdev = vdev;
        vb->need_stats_update = 0;
  
 +      vb_devinfo = balloon_devinfo_alloc(vb);
 +      if (IS_ERR(vb_devinfo)) {
 +              err = PTR_ERR(vb_devinfo);
 +              goto out_free_vb;
 +      }
 +
 +      vb_mapping = balloon_mapping_alloc(vb_devinfo,
 +                                         (balloon_compaction_check()) ?
 +                                         &virtio_balloon_aops : NULL);
 +      if (IS_ERR(vb_mapping)) {
 +              /*
 +               * IS_ERR(vb_mapping) && PTR_ERR(vb_mapping) == -EOPNOTSUPP
 +               * This means !CONFIG_BALLOON_COMPACTION, otherwise we get off.
 +               */
 +              err = PTR_ERR(vb_mapping);
 +              if (err != -EOPNOTSUPP)
 +                      goto out_free_vb_devinfo;
 +      }
 +
 +      vb->vb_dev_info = vb_devinfo;
 +
        err = init_vqs(vb);
        if (err)
 -              goto out_free_vb;
 +              goto out_free_vb_mapping;
  
        vb->thread = kthread_run(balloon, vb, "vballoon");
        if (IS_ERR(vb->thread)) {
  
  out_del_vqs:
        vdev->config->del_vqs(vdev);
 +out_free_vb_mapping:
 +      balloon_mapping_free(vb_mapping);
 +out_free_vb_devinfo:
 +      balloon_devinfo_free(vb_devinfo);
  out_free_vb:
        kfree(vb);
  out:
@@@ -507,8 -395,6 +506,8 @@@ static void __devexit virtballoon_remov
  
        kthread_stop(vb->thread);
        remove_common(vb);
 +      balloon_mapping_free(vb->vb_dev_info->mapping);
 +      balloon_devinfo_free(vb->vb_dev_info);
        kfree(vb);
  }
  
diff --combined mm/highmem.c
@@@ -98,13 -98,14 +98,14 @@@ struct page *kmap_to_page(void *vaddr
  {
        unsigned long addr = (unsigned long)vaddr;
  
 -      if (addr >= PKMAP_ADDR(0) && addr <= PKMAP_ADDR(LAST_PKMAP)) {
 -              int i = (addr - PKMAP_ADDR(0)) >> PAGE_SHIFT;
 +      if (addr >= PKMAP_ADDR(0) && addr < PKMAP_ADDR(LAST_PKMAP)) {
 +              int i = PKMAP_NR(addr);
                return pte_page(pkmap_page_table[i]);
        }
  
        return virt_to_page(addr);
  }
+ EXPORT_SYMBOL(kmap_to_page);
  
  static void flush_all_zero_pkmaps(void)
  {
                 * So no dangers, even with speculative execution.
                 */
                page = pte_page(pkmap_page_table[i]);
 -              pte_clear(&init_mm, (unsigned long)page_address(page),
 -                        &pkmap_page_table[i]);
 +              pte_clear(&init_mm, PKMAP_ADDR(i), &pkmap_page_table[i]);
  
                set_page_address(page, NULL);
                need_flush = 1;
@@@ -323,7 -325,11 +324,7 @@@ struct page_address_map 
        struct list_head list;
  };
  
 -/*
 - * page_address_map freelist, allocated from page_address_maps.
 - */
 -static struct list_head page_address_pool;    /* freelist */
 -static spinlock_t pool_lock;                  /* protects page_address_pool */
 +static struct page_address_map page_address_maps[LAST_PKMAP];
  
  /*
   * Hash table bucket
@@@ -388,7 -394,14 +389,7 @@@ void set_page_address(struct page *page
  
        pas = page_slot(page);
        if (virtual) {          /* Add */
 -              BUG_ON(list_empty(&page_address_pool));
 -
 -              spin_lock_irqsave(&pool_lock, flags);
 -              pam = list_entry(page_address_pool.next,
 -                              struct page_address_map, list);
 -              list_del(&pam->list);
 -              spin_unlock_irqrestore(&pool_lock, flags);
 -
 +              pam = &page_address_maps[PKMAP_NR((unsigned long)virtual)];
                pam->page = page;
                pam->virtual = virtual;
  
                        if (pam->page == page) {
                                list_del(&pam->list);
                                spin_unlock_irqrestore(&pas->lock, flags);
 -                              spin_lock_irqsave(&pool_lock, flags);
 -                              list_add_tail(&pam->list, &page_address_pool);
 -                              spin_unlock_irqrestore(&pool_lock, flags);
                                goto done;
                        }
                }
@@@ -410,14 -426,20 +411,14 @@@ done
        return;
  }
  
 -static struct page_address_map page_address_maps[LAST_PKMAP];
 -
  void __init page_address_init(void)
  {
        int i;
  
 -      INIT_LIST_HEAD(&page_address_pool);
 -      for (i = 0; i < ARRAY_SIZE(page_address_maps); i++)
 -              list_add(&page_address_maps[i].list, &page_address_pool);
        for (i = 0; i < ARRAY_SIZE(page_address_htable); i++) {
                INIT_LIST_HEAD(&page_address_htable[i].lh);
                spin_lock_init(&page_address_htable[i].lock);
        }
 -      spin_lock_init(&pool_lock);
  }
  
  #endif        /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */
@@@ -164,7 -164,7 +164,7 @@@ static void run_test(struct vdev_info *
                                r = virtqueue_add_buf(vq->vq, &sl, 1, 0,
                                                      dev->buf + started,
                                                      GFP_ATOMIC);
-                               if (likely(r >= 0)) {
+                               if (likely(r == 0)) {
                                        ++started;
                                        virtqueue_kick(vq->vq);
                                }
                                r = 0;
                        }
  
-               } while (r >= 0);
+               } while (r == 0);
                if (completed == completed_before)
                        ++spurious;
                assert(completed <= bufs);
@@@ -232,7 -232,7 +232,7 @@@ const struct option longopts[] = 
        }
  };
  
 -static void help()
 +static void help(void)
  {
        fprintf(stderr, "Usage: virtio_test [--help]"
                " [--no-indirect]"