[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Xen-users] Re: [Xen-devel] VM disk I/O limit patch



On Tue, 21 Jun 2011 09:33:37 -0400
Konrad Rzeszutek Wilk <konrad.wilk@xxxxxxxxxx> wrote:

> On Tue, Jun 21, 2011 at 04:29:35PM +0800, Andrew Xu wrote:
> > Hi all,
> > 
> > I add a blkback QoS patch.
> 
> What tree is this against? 
This patch is based on the SUSE 11 SP1 (kernel 2.6.32) xen-blkback source.
(2.6.18 "Xenlinux" based source trees?)

> There is a xen-blkback in 3.0-rc4, can you rebase
> it against that please.
> 
Ok, I will rebase it.

> What is the patch solving? 
> 
With this patch, you can set a different I/O speed limit for each VM disk.
For example, I set vm17-disk1 4 MB/s
                   vm17-disk2 1 MB/s
                   vm18-disk3 3 MB/s
I/O speed by writing the following xenstore key-values (tokens-rate is in KB/s).
        /local/domain/17/device/vbd/768/tokens-rate = "4096"
        /local/domain/17/device/vbd/2048/tokens-rate = "1024"
        /local/domain/18/device/vbd/768/tokens-rate = "3096"

> Why can't it be done with dm-ioband?
Of course, I/O speed limiting can also be done with dm-ioband.
But with my patch, there is no need to load dm-ioband any more.
This patch does the speed limiting closer to the disk and is more lightweight.


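How does it work?
1) It records how many sectors (512 bytes each) have been submitted to the disk in the current 1-second window.
2) If that count exceeds 2*tokens-rate (tokens-rate is in KB/s, and 1 KB is two 512-byte sectors), it sleeps for (1 - time passed) seconds.
3) When the next second begins, it goes back to 1).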

> > You can config(dynamic/static) different I/O speed for different VM disk
> > by this patch.
> > 
> > ----------------------------------------------------------------------------
> > 
> > diff -urNp blkback/blkback.c blkback-qos/blkback.c
> > --- blkback/blkback.c       2011-06-22 07:54:19.000000000 +0800
> > +++ blkback-qos/blkback.c   2011-06-22 07:53:18.000000000 +0800
> > @@ -44,6 +44,11 @@
> >  #include <asm/hypervisor.h>
> >  #include "common.h"
> >  
> > +#undef DPRINTK
> > +#define DPRINTK(fmt, args...)                              \
> > +   printk("blkback/blkback (%s:%d) " fmt ".\n",    \
> > +            __FUNCTION__, __LINE__, ##args)
> > +
> >  /*
> >   * These are rather arbitrary. They are fairly large because adjacent 
> > requests
> >   * pulled from a communication ring are quite likely to end up being part 
> > of
> > @@ -110,7 +115,8 @@ static inline unsigned long vaddr(pendin
> >  static int do_block_io_op(blkif_t *blkif);
> >  static int dispatch_rw_block_io(blkif_t *blkif,
> >                              blkif_request_t *req,
> > -                            pending_req_t *pending_req);
> > +                            pending_req_t *pending_req,
> > +                            int *done_nr_sects);
> >  static void make_response(blkif_t *blkif, u64 id,
> >                       unsigned short op, int st);
> >  
> > @@ -206,10 +212,20 @@ static void print_stats(blkif_t *blkif)
> >     blkif->st_pk_req = 0;
> >  }
> >  
> > +static void refill_reqcount(blkif_t *blkif)
> > +{
> > +   blkif->reqtime = jiffies + msecs_to_jiffies(1000);
> > +   blkif->reqcount = blkif->reqrate;
> > +   if (blkif->reqcount < blkif->reqmin)
> > +           blkif->reqcount = blkif->reqmin;
> > +}
> > +
> >  int blkif_schedule(void *arg)
> >  {
> >     blkif_t *blkif = arg;
> >     struct vbd *vbd = &blkif->vbd;
> > +   int     ret = 0;
> > +   struct timeval cur_time;
> >  
> >     blkif_get(blkif);
> >  
> > @@ -232,12 +248,34 @@ int blkif_schedule(void *arg)
> >             blkif->waiting_reqs = 0;
> >             smp_mb(); /* clear flag *before* checking for work */
> >  
> > -           if (do_block_io_op(blkif))
> > +           ret = do_block_io_op(blkif);
> > +           if (ret)
> >                     blkif->waiting_reqs = 1;
> >             unplug_queue(blkif);
> >  
> > +           if(blkif->reqmin){
> > +                   if(2 == ret && (blkif->reqtime > jiffies)){
> > +                           jiffies_to_timeval(jiffies, &cur_time);
> > +                           if(log_stats && (cur_time.tv_sec % 10 ==1 ))
> > +                                   printk(KERN_DEBUG "%s: going to sleep 
> > %d millsecs(rate=%d)\n",
> > +                                                   current->comm,
> > +                                                   
> > jiffies_to_msecs(blkif->reqtime - jiffies),
> > +                                                   blkif->reqrate);
> > +                           
> > +                           set_current_state(TASK_INTERRUPTIBLE);
> > +                           schedule_timeout(blkif->reqtime - jiffies);
> > +                           
> > +                           if(log_stats && (cur_time.tv_sec % 10 ==1 ))
> > +                                   printk(KERN_DEBUG "%s: sleep 
> > end(rate=%d)\n",
> > +                                                   
> > current->comm,blkif->reqrate);
> > +                   }
> > +                   if (time_after(jiffies, blkif->reqtime))
> > +                           refill_reqcount(blkif);
> > +           }
> > +
> >             if (log_stats && time_after(jiffies, blkif->st_print))
> >                     print_stats(blkif);
> > +           
> >     }
> >  
> >     if (log_stats)
> > @@ -306,7 +344,6 @@ irqreturn_t blkif_be_int(int irq, void *
> >  /******************************************************************
> >   * DOWNWARD CALLS -- These interface with the block-device layer proper.
> >   */
> > -
> >  static int do_block_io_op(blkif_t *blkif)
> >  {
> >     blkif_back_rings_t *blk_rings = &blkif->blk_rings;
> > @@ -314,15 +351,27 @@ static int do_block_io_op(blkif_t *blkif
> >     pending_req_t *pending_req;
> >     RING_IDX rc, rp;
> >     int more_to_do = 0, ret;
> > +   static int last_done_nr_sects = 0;      
> >  
> >     rc = blk_rings->common.req_cons;
> >     rp = blk_rings->common.sring->req_prod;
> >     rmb(); /* Ensure we see queued requests up to 'rp'. */
> > +   
> > +   if (blkif->reqmin && blkif->reqcount <= 0)
> > +           return (rc != rp) ? 2 : 0;
> >  
> >     while ((rc != rp) || (blkif->is_suspended_req)) {
> >  
> >             if (RING_REQUEST_CONS_OVERFLOW(&blk_rings->common, rc))
> >                     break;
> > +           
> > +           if(blkif->reqmin){
> > +                   blkif->reqcount -= last_done_nr_sects;
> > +                   if (blkif->reqcount <= 0) {
> > +                           more_to_do = 2;
> > +                           break;
> > +                   }
> > +           }
> >  
> >             if (kthread_should_stop()) {
> >                     more_to_do = 1;
> > @@ -367,14 +416,14 @@ handle_request:
> >             switch (req.operation) {
> >             case BLKIF_OP_READ:
> >                     blkif->st_rd_req++;
> > -                   ret = dispatch_rw_block_io(blkif, &req, pending_req); 
> > +                   ret = dispatch_rw_block_io(blkif, &req, 
> > pending_req,&last_done_nr_sects); 
> >                     break;
> >             case BLKIF_OP_WRITE_BARRIER:
> >                     blkif->st_br_req++;
> >                     /* fall through */
> >             case BLKIF_OP_WRITE:
> >                     blkif->st_wr_req++;
> > -                   ret = dispatch_rw_block_io(blkif, &req, pending_req);
> > +                   ret = dispatch_rw_block_io(blkif, &req, 
> > pending_req,&last_done_nr_sects);
> >                     break;
> >             case BLKIF_OP_PACKET:
> >                     DPRINTK("error: block operation BLKIF_OP_PACKET not 
> > implemented\n");
> > @@ -412,9 +461,29 @@ handle_request:
> >     return more_to_do;
> >  }
> >  
> > +static char* operation2str(int operation)
> > +{
> > +   char* ret_str = NULL;
> > +   switch (operation) {
> > +   case BLKIF_OP_READ:
> > +           ret_str = "READ";
> > +           break;
> > +   case BLKIF_OP_WRITE:
> > +           ret_str = "WRITE";
> > +           break;
> > +   case BLKIF_OP_WRITE_BARRIER:
> > +           ret_str = "WRITE_BARRIER";
> > +           break;
> > +   default:
> > +           ret_str = "0";
> > +   }
> > +   return ret_str;
> > +}
> > +
> >  static int dispatch_rw_block_io(blkif_t *blkif,
> >                              blkif_request_t *req,
> > -                            pending_req_t *pending_req)
> > +                            pending_req_t *pending_req,
> > +                            int *done_nr_sects)
> >  {
> >     extern void ll_rw_block(int rw, int nr, struct buffer_head * bhs[]);
> >     struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST];
> > @@ -426,6 +495,9 @@ static int dispatch_rw_block_io(blkif_t
> >     struct bio *bio = NULL;
> >     int ret, i;
> >     int operation;
> > +   struct timeval cur_time;
> > +
> > +   *done_nr_sects = 0;
> >  
> >     switch (req->operation) {
> >     case BLKIF_OP_READ:
> > @@ -582,6 +654,12 @@ static int dispatch_rw_block_io(blkif_t
> >     else if (operation == WRITE || operation == WRITE_BARRIER)
> >             blkif->st_wr_sect += preq.nr_sects;
> >  
> > +   *done_nr_sects = preq.nr_sects;
> > +   jiffies_to_timeval(jiffies, &cur_time);
> > +   if ((log_stats == 2) && (cur_time.tv_sec % 10 ==1 ))
> > +           printk(KERN_DEBUG "  operation=%s sects=%d\n",
> > +                   operation2str(req->operation),preq.nr_sects);
> > +
> >     return 0;
> >  
> >   fail_flush:
> > @@ -695,6 +773,8 @@ static int __init blkif_init(void)
> >  
> >     blkif_xenbus_init();
> >  
> > +   DPRINTK("blkif_inited\n");
> > +
> >     return 0;
> >  
> >   out_of_memory:
> > diff -urNp blkback/cdrom.c blkback-qos/cdrom.c
> > --- blkback/cdrom.c 2010-05-20 18:07:00.000000000 +0800
> > +++ blkback-qos/cdrom.c     2011-06-22 07:34:50.000000000 +0800
> > @@ -35,9 +35,9 @@
> >  #include "common.h"
> >  
> >  #undef DPRINTK
> > -#define DPRINTK(_f, _a...)                 \
> > -   printk("(%s() file=%s, line=%d) " _f "\n",      \
> > -            __PRETTY_FUNCTION__, __FILE__ , __LINE__ , ##_a )
> > +#define DPRINTK(fmt, args...)                              \
> > +   printk("blkback/cdrom (%s:%d) " fmt ".\n",      \
> > +            __FUNCTION__, __LINE__, ##args)
> >  
> >  
> >  #define MEDIA_PRESENT "media-present"
> > diff -urNp blkback/common.h blkback-qos/common.h
> > --- blkback/common.h        2010-05-20 18:07:00.000000000 +0800
> > +++ blkback-qos/common.h    2011-06-22 07:34:50.000000000 +0800
> > @@ -100,8 +100,17 @@ typedef struct blkif_st {
> >  
> >     grant_handle_t shmem_handle;
> >     grant_ref_t    shmem_ref;
> > +
> > +   /* qos information */
> > +   unsigned long   reqtime;
> > +   int    reqcount;
> > +   int    reqmin;
> > +   int    reqrate; 
> > +
> >  } blkif_t;
> >  
> > +#define VBD_QOS_MIN_RATE_LIMIT                     2*1024          /*      
> > 1MBs    */
> > +
> >  struct backend_info
> >  {
> >     struct xenbus_device *dev;
> > @@ -111,6 +120,8 @@ struct backend_info
> >     unsigned major;
> >     unsigned minor;
> >     char *mode;
> > +   struct xenbus_watch rate_watch;
> > +   int have_rate_watch; 
> >  };
> >  
> >  blkif_t *blkif_alloc(domid_t domid);
> > diff -urNp blkback/vbd.c blkback-qos/vbd.c
> > --- blkback/vbd.c   2010-05-20 18:07:00.000000000 +0800
> > +++ blkback-qos/vbd.c       2011-06-22 07:34:50.000000000 +0800
> > @@ -35,6 +35,11 @@
> >  #define vbd_sz(_v)   ((_v)->bdev->bd_part ?                                
> > \
> >     (_v)->bdev->bd_part->nr_sects : get_capacity((_v)->bdev->bd_disk))
> >  
> > +#undef DPRINTK
> > +#define DPRINTK(fmt, args...)                              \
> > +   printk("blkback/vbd (%s:%d) " fmt ".\n",        \
> > +            __FUNCTION__, __LINE__, ##args)
> > +
> >  unsigned long long vbd_size(struct vbd *vbd)
> >  {
> >     return vbd_sz(vbd);
> > @@ -87,7 +92,7 @@ int vbd_create(blkif_t *blkif, blkif_vde
> >     if (vbd->bdev->bd_disk->flags & GENHD_FL_REMOVABLE)
> >             vbd->type |= VDISK_REMOVABLE;
> >  
> > -   DPRINTK("Successful creation of handle=%04x (dom=%u)\n",
> > +   DPRINTK("Successful creation of handle=%04x (dom=%u)",
> >             handle, blkif->domid);
> >     return 0;
> >  }
> > diff -urNp blkback/xenbus.c blkback-qos/xenbus.c
> > --- blkback/xenbus.c        2010-05-20 18:07:00.000000000 +0800
> > +++ blkback-qos/xenbus.c    2011-06-22 07:34:50.000000000 +0800
> > @@ -25,13 +25,14 @@
> >  
> >  #undef DPRINTK
> >  #define DPRINTK(fmt, args...)                              \
> > -   pr_debug("blkback/xenbus (%s:%d) " fmt ".\n",   \
> > +   printk("blkback/xenbus (%s:%d) " fmt ".\n",     \
> >              __FUNCTION__, __LINE__, ##args)
> >  
> >  static void connect(struct backend_info *);
> >  static int connect_ring(struct backend_info *);
> >  static void backend_changed(struct xenbus_watch *, const char **,
> >                         unsigned int);
> > +static void unregister_rate_watch(struct backend_info *be);
> >  
> >  static int blkback_name(blkif_t *blkif, char *buf)
> >  {
> > @@ -59,8 +60,10 @@ static void update_blkif_status(blkif_t
> >     char name[TASK_COMM_LEN];
> >  
> >     /* Not ready to connect? */
> > -   if (!blkif->irq || !blkif->vbd.bdev)
> > +   if (!blkif->irq || !blkif->vbd.bdev){
> > +           DPRINTK("Not ready to connect");
> >             return;
> > +   }
> >  
> >     /* Already connected? */
> >     if (blkif->be->dev->state == XenbusStateConnected)
> > @@ -193,6 +196,8 @@ static int blkback_remove(struct xenbus_
> >             be->cdrom_watch.node = NULL;
> >     }
> >  
> > +   unregister_rate_watch(be);
> > +
> >     if (be->blkif) {
> >             blkif_disconnect(be->blkif);
> >             vbd_free(&be->blkif->vbd);
> > @@ -251,6 +256,10 @@ static int blkback_probe(struct xenbus_d
> >  
> >     err = xenbus_watch_path2(dev, dev->nodename, "physical-device",
> >                              &be->backend_watch, backend_changed);
> > +
> > +   DPRINTK("blkback_probe called");
> > +   DPRINTK("dev->nodename=%s/physical-device",dev->nodename);
> > +   
> >     if (err)
> >             goto fail;
> >  
> > @@ -266,7 +275,6 @@ fail:
> >     return err;
> >  }
> >  
> > -
> >  /**
> >   * Callback received when the hotplug scripts have placed the 
> > physical-device
> >   * node.  Read it and the mode node, and create a vbd.  If the frontend is
> > @@ -283,8 +291,9 @@ static void backend_changed(struct xenbu
> >     struct xenbus_device *dev = be->dev;
> >     int cdrom = 0;
> >     char *device_type;
> > +   char name[TASK_COMM_LEN];
> >  
> > -   DPRINTK("");
> > +   DPRINTK("backend_changed called");
> >  
> >     err = xenbus_scanf(XBT_NIL, dev->nodename, "physical-device", "%x:%x",
> >                        &major, &minor);
> > @@ -322,6 +331,34 @@ static void backend_changed(struct xenbu
> >             kfree(device_type);
> >     }
> >  
> > +   /* gather information about QoS policy for this device. */
> > +   err = blkback_name(be->blkif, name);
> > +   if (err) {
> > +           xenbus_dev_error(be->dev, err, "get blkback dev name");
> > +           return;
> > +   }
> > +   
> > +   err = xenbus_gather(XBT_NIL, dev->otherend,
> > +                           "tokens-rate", "%d", &be->blkif->reqrate, 
> > +                           NULL);
> > +   if(err){
> > +           DPRINTK("%s xenbus_gather(tokens-min,tokens-rate) error",name);
> > +   }else{
> > +           if(be->blkif->reqrate <= 0){
> > +                   be->blkif->reqmin = 0 ;
> > +                   DPRINTK("%s tokens-rate == 0,no limit",name);   
> > +           }else{
> > +                   DPRINTK("%s 
> > xenbus_gather(tokens-rate=%d)",name,be->blkif->reqrate);
> > +                   be->blkif->reqrate *= 2;
> > +                   be->blkif->reqmin = VBD_QOS_MIN_RATE_LIMIT;
> > +                   if(be->blkif->reqmin > be->blkif->reqrate){
> > +                           be->blkif->reqrate = be->blkif->reqmin;
> > +                           DPRINTK("%s reset default 
> > value(tokens-rate=%d)",name,be->blkif->reqrate);
> > +                   }
> > +           }
> > +   }
> > +   be->blkif->reqtime = jiffies;
> > +
> >     if (be->major == 0 && be->minor == 0) {
> >             /* Front end dir is a number, which is used as the handle. */
> >  
> > @@ -414,6 +451,49 @@ static void frontend_changed(struct xenb
> >  
> >  /* ** Connection ** */
> >  
> > +static void unregister_rate_watch(struct backend_info *be)
> > +{
> > +   if (be->have_rate_watch) {
> > +           unregister_xenbus_watch(&be->rate_watch);
> > +           kfree(be->rate_watch.node);
> > +   }
> > +   be->have_rate_watch = 0;
> > +}
> > +
> > +static void rate_changed(struct xenbus_watch *watch,
> > +                       const char **vec, unsigned int len)
> > +{
> > +
> > +   struct backend_info *be=container_of(watch,struct backend_info, 
> > rate_watch);
> > +   int err;
> > +   char name[TASK_COMM_LEN];
> > +
> > +   err = blkback_name(be->blkif, name);
> > +   if (err) {
> > +           xenbus_dev_error(be->dev, err, "get blkback dev name");
> > +           return;
> > +   }
> > +
> > +   err = xenbus_gather(XBT_NIL,be->dev->otherend, 
> > +                                   "tokens-rate",  "%d", 
> > +                                   &be->blkif->reqrate,NULL);
> > +   if(err){
> > +           DPRINTK("%s xenbus_gather(tokens-rate) error",name);
> > +   }else{
> > +           if(be->blkif->reqrate <= 0){
> > +                   be->blkif->reqmin = 0;
> > +                   DPRINTK("%s tokens-rate == 0,no limit",name);   
> > +           }else{
> > +                   DPRINTK("%s 
> > xenbus_gather(tokens-rate=%d)",name,be->blkif->reqrate);
> > +                   be->blkif->reqrate *= 2;
> > +                   be->blkif->reqmin = VBD_QOS_MIN_RATE_LIMIT;
> > +                   if(be->blkif->reqmin > be->blkif->reqrate){
> > +                           be->blkif->reqrate = be->blkif->reqmin;
> > +                           DPRINTK("%s reset default 
> > value(tokens-rate=%d)",name,be->blkif->reqrate);
> > +                   }
> > +           }
> > +   }
> > +}
> >  
> >  /**
> >   * Write the physical details regarding the block device to the store, and
> > @@ -439,6 +519,14 @@ again:
> >     if (err)
> >             goto abort;
> >  
> > +   /*add by andrew for centos pv*/
> > +   err = xenbus_printf(xbt, dev->nodename,"feature-flush-cache", "1");
> > +   if (err){
> > +           xenbus_dev_fatal(dev, err, "writing %s/feature-flush-cache",
> > +                   dev->nodename);
> > +           goto abort;
> > +   }
> > +
> >     err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu",
> >                         vbd_size(&be->blkif->vbd));
> >     if (err) {
> > @@ -469,11 +557,22 @@ again:
> >     if (err)
> >             xenbus_dev_fatal(dev, err, "ending transaction");
> >  
> > +   DPRINTK("xenbus_switch_to XenbusStateConnected");
> > +
> >     err = xenbus_switch_state(dev, XenbusStateConnected);
> >     if (err)
> >             xenbus_dev_fatal(dev, err, "switching to Connected state",
> >                              dev->nodename);
> >  
> > +   unregister_rate_watch(be);
> > +   err=xenbus_watch_path2(dev, dev->otherend, "tokens-rate",
> > +                                                           
> > &be->rate_watch,rate_changed);
> > +   if (!err)
> > +           be->have_rate_watch = 1;
> > +   else
> > +           xenbus_dev_fatal(dev, err, "watching tokens-rate",
> > +                            dev->nodename);
> > +
> >     return;
> >   abort:
> >     xenbus_transaction_end(xbt, 1);
> 
> 
> > _______________________________________________
> > Xen-devel mailing list
> > Xen-devel@xxxxxxxxxxxxxxxxxxx
> > http://lists.xensource.com/xen-devel
> 
> 
> _______________________________________________
> Xen-users mailing list
> Xen-users@xxxxxxxxxxxxxxxxxxx
> http://lists.xensource.com/xen-users

**************************************************
徐安(Andrew Xu)
部门  :云快线 - 运营支撑中心 - 研发中心
手机  : 18910391796
E-mail:xu.an@xxxxxxxxxx
地址  :北京市朝阳区酒仙桥东路1号M5楼
**************************************************


_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.