[Xen-devel] RE: VM hung after running sometime

I have also seen HVM guests hang in our stress testing. For details, please see
the bugzilla entry:
http://bugzilla.xensource.com/bugzilla/show_bug.cgi?id=1664


best regards
yang


> -----Original Message-----
> From: xen-devel-bounces@xxxxxxxxxxxxxxxxxxx
> [mailto:xen-devel-bounces@xxxxxxxxxxxxxxxxxxx] On Behalf Of Keir Fraser
> Sent: Sunday, September 19, 2010 7:50 PM
> To: MaoXiaoyun
> Cc: xen devel
> Subject: [Xen-devel] Re: VM hung after running sometime
> 
> On 19/09/2010 11:37, "MaoXiaoyun" <tinnycloud@xxxxxxxxxxx> wrote:
> 
> > Hi Keir:
> >
> >        Regarding the HVM hang: according to our recent tests, it turns out
> > that this issue still exists.
> >        While going through the code, I observed something abnormal and
> > need your help.
> >
> >       We've noticed that when a VM hangs, its VCPU flags value is always 4,
> > which indicates _VPF_blocked_in_xen,
> >       and this flag is set in prepare_wait_on_xen_event_channel. I've also
> > noticed that Domain U has set up
> >       an event channel with domain 0 for each VCPU, and qemu-dm selects on
> > the event fd.
> >
> >       notify_via_xen_event_channel is called when Domain U issues a
> > request. qemu-dm will then
> >       get the event and invoke
> > cpu_handle_ioreq(/xen-4.0.0/tools/ioemu-qemu-xen/i386-dm/helper2.c)
> >      ->cpu_get_ioreq()->xc_evtchn_unmask(). evtchn_unmask operates on
> > evtchn_pending,
> >       evtchn_mask, or evtchn_pending_sel.
> >
> >       My confusion is about
> > notify_via_xen_event_channel()->evtchn_set_pending:
> > the **evtchn_set_pending call here
> >       is not locked**, while inside it also operates on evtchn_pending,
> > evtchn_mask, or evtchn_pending_sel.
> 
> Atomic ops are used to make the operations on evtchn_pending, evtchn_mask,
> and evtchn_pending_sel concurrency-safe. Note that the locking from
> notify_via_xen_event_channel() is just the same as, say, from evtchn_send():
> the local domain's (ie. DomU's, in this case) event_lock is held, while the
> remote domain's (ie. dom0's, in this case) does not need to be held.
> 
> If your domU is stuck in state _VPF_blocked_in_xen, it probably means
> qemu-dm is toast. I would investigate whether the qemu-dm process is still
> present, still doing useful work, etc etc.
> 
>  -- Keir
> 
> >       I'm afraid this race might cause an event from dom U to qemu-dm to
> > go undelivered, but I am not sure,
> >      since I still do not fully understand where evtchn_mask is set and
> > where evtchn_pending is cleared.
> >
> > -------------------------notify_via_xen_event_channel-------------------------
> > ------------
> >  989 void notify_via_xen_event_channel(int lport)
> >  990 {
> >  991     struct evtchn *lchn, *rchn;
> >  992     struct domain *ld = current->domain, *rd;
> >  993     int            rport;
> >  994
> >  995     spin_lock(&ld->event_lock);
> >  996
> >  997     ASSERT(port_is_valid(ld, lport));
> >  998     lchn = evtchn_from_port(ld, lport);
> >  999     ASSERT(lchn->consumer_is_xen);
> > 1000
> > 1001     if ( likely(lchn->state == ECS_INTERDOMAIN) )
> > 1002     {
> > 1003         rd    = lchn->u.interdomain.remote_dom;
> > 1004         rport = lchn->u.interdomain.remote_port;
> > 1005         rchn  = evtchn_from_port(rd, rport);
> > 1006         evtchn_set_pending(rd->vcpu[rchn->notify_vcpu_id], rport);
> > 1007     }
> > 1008
> > 1009     spin_unlock(&ld->event_lock);
> > 1010 }
> >
> > ----------------------------evtchn_set_pending----------------------
> > 535 static int evtchn_set_pending(struct vcpu *v, int port)
> >  536 {
> >  537     struct domain *d = v->domain;
> >  538     int vcpuid;
> >  539
> >  540     /*
> >  541      * The following bit operations must happen in strict order.
> >  542      * NB. On x86, the atomic bit operations also act as memory
> barriers.
> >  543      * There is therefore sufficiently strict ordering for this
> > architecture --
> >  544      * others may require explicit memory barriers.
> >  545      */
> >  546
> >  547     if ( test_and_set_bit(port, &shared_info(d, evtchn_pending)) )
> >  548         return 1;
> >  549
> >  550     if ( !test_bit        (port, &shared_info(d, evtchn_mask)) &&
> >  551          !test_and_set_bit(port / BITS_PER_EVTCHN_WORD(d),
> >  552                            &vcpu_info(v, evtchn_pending_sel)) )
> >  553     {
> >  554         vcpu_mark_events_pending(v);
> >  555     }
> >  556
> >  557     /* Check if some VCPU might be polling for this event. */
> >  558     if ( likely(bitmap_empty(d->poll_mask, d->max_vcpus)) )
> >  559         return 0;
> >  560
> >  561     /* Wake any interested (or potentially interested) pollers. */
> >  562     for ( vcpuid = find_first_bit(d->poll_mask, d->max_vcpus);
> >  563           vcpuid < d->max_vcpus;
> >  564           vcpuid = find_next_bit(d->poll_mask, d->max_vcpus,
> vcpuid+1) )
> >  565     {
> >  566         v = d->vcpu[vcpuid];
> >  567         if ( ((v->poll_evtchn <= 0) || (v->poll_evtchn == port)) &&
> >  568              test_and_clear_bit(vcpuid, d->poll_mask) )
> >  569         {
> >  570             v->poll_evtchn = 0;
> >  571             vcpu_unblock(v);
> >
> > --------------------------------------evtchn_unmask---------------------------
> > ---
> >  764
> >  765 int evtchn_unmask(unsigned int port)
> >  766 {
> >  767     struct domain *d = current->domain;
> >  768     struct vcpu   *v;
> >  769
> >  770     spin_lock(&d->event_lock);
> >  771
> >  772     if ( unlikely(!port_is_valid(d, port)) )
> >  773     {
> >  774         spin_unlock(&d->event_lock);
> >  775         return -EINVAL;
> >  776     }
> >  777
> >  778     v = d->vcpu[evtchn_from_port(d, port)->notify_vcpu_id];
> >  779
> >  780     /*
> >  781      * These operations must happen in strict order. Based on
> >  782      * include/xen/event.h:evtchn_set_pending().
> >  783      */
> >  784     if ( test_and_clear_bit(port, &shared_info(d, evtchn_mask)) &&
> >  785          test_bit          (port, &shared_info(d,
> evtchn_pending)) &&
> >  786          !test_and_set_bit (port / BITS_PER_EVTCHN_WORD(d),
> >  787                             &vcpu_info(v,
> evtchn_pending_sel)) )
> >  788     {
> >  789         vcpu_mark_events_pending(v);
> >  790     }
> >  791
> >  792     spin_unlock(&d->event_lock);
> >  793
> >  794     return 0;
> >  795 }
> >  ----------------------------cpu_get_ioreq-------------------------
> > 260 static ioreq_t *cpu_get_ioreq(void)
> > 261 {
> > 262     int i;
> > 263     evtchn_port_t port;
> > 264
> > 265     port = xc_evtchn_pending(xce_handle);
> > 266     if (port != -1) {
> > 267         for ( i = 0; i < vcpus; i++ )
> > 268             if ( ioreq_local_port[i] == port )
> > 269                 break;
> > 270
> > 271         if ( i == vcpus ) {
> > 272             fprintf(logfile, "Fatal error while trying to get io
> > event!\n");
> > 273             exit(1);
> > 274         }
> > 275
> > 276         // unmask the wanted port again
> > 277         xc_evtchn_unmask(xce_handle, port);
> > 278
> > 279         //get the io packet from shared memory
> > 280         send_vcpu = i;
> > 281         return __cpu_get_ioreq(i);
> > 282     }
> > 283
> > 284     //read error or read nothing
> > 285     return NULL;
> > 286 }
> > 287
> >
> >
> 
> 
> 
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@xxxxxxxxxxxxxxxxxxx
> http://lists.xensource.com/xen-devel

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel
WARNING - OLD ARCHIVES

xen-devel

[Xen-devel] RE: VM hung after running sometime