Perhaps try dropping calls to domain_kill() into memory_exchange()? It may
not complete the destroy, but it will set ->is_dying and trigger your error
paths. Then you can 'xm destroy' from the tools afterwards to try to finish
up the destroy, and see if you get left with a zombie or not.
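Something like the following, dropped at the top of the chunk loop, would do it
(an untested sketch; the i == 1 trigger point is arbitrary):

    /* Test hack only: force the dying path partway through the exchange. */
    if ( (i == 1) && (exch.in.domid != DOMID_SELF) )
        (void)domain_kill(d);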
Looking at this patch I can immediately see that:
A. Your extra check(s) of is_dying are pointless. Only the check in
assign_pages() is critical. So just leave it to that.
B. Your adjustment of tot_pages is confusing. I can't convince myself the
arithmetic is correct. Furthermore messing with tot_pages outside of the
d->page_alloc_lock is not allowed.
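If tot_pages really does need adjusting, it has to happen while holding that
lock, roughly along these lines (a sketch only; drop_dom_ref would be a new
local):

    spin_lock(&d->page_alloc_lock);
    d->tot_pages -= dec_count;
    drop_dom_ref = (dec_count && !d->tot_pages);
    spin_unlock(&d->page_alloc_lock);
    if ( drop_dom_ref )
        put_domain(d);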
-- Keir
On 30/06/2009 08:55, "Jiang, Yunhong" <yunhong.jiang@xxxxxxxxx> wrote:
> Keir, this is the patch based on the earlier discussion. Can you please have a
> look at it? I can't trigger the corner case of the domain dying, so I hope it
> can be verified through code review.
>
> Thanks
> Yunhong Jiang
>
> Extend memory_exchange to support exchanging memory on behalf of a foreign
> domain. If the domain is killed during the memory exchange, it will try to
> decrease the domain's page count by the number of pages that have already been
> removed from its page_list.
>
> Signed-off-by: Jiang, Yunhong <yunhong.jiang@xxxxxxxxx>
>
> diff -r 02003bee3e80 xen/common/memory.c
> --- a/xen/common/memory.c Thu Jun 25 18:31:10 2009 +0100
> +++ b/xen/common/memory.c Tue Jun 30 01:44:28 2009 +0800
> @@ -265,16 +265,22 @@ static long memory_exchange(XEN_GUEST_HA
> out_chunk_order = exch.in.extent_order - exch.out.extent_order;
> }
>
> - /*
> - * Only support exchange on calling domain right now. Otherwise there are
> - * tricky corner cases to consider (e.g., dying domain).
> - */
> - if ( unlikely(exch.in.domid != DOMID_SELF) )
> - {
> - rc = IS_PRIV(current->domain) ? -EINVAL : -EPERM;
> - goto fail_early;
> - }
> - d = current->domain;
> + if ( likely(exch.in.domid == DOMID_SELF) )
> + {
> + d = rcu_lock_current_domain();
> + }
> + else
> + {
> + if ( (d = rcu_lock_domain_by_id(exch.in.domid)) == NULL )
> + goto fail_early;
> +
> + if ( !IS_PRIV_FOR(current->domain, d) )
> + {
> + rcu_unlock_domain(d);
> + rc = -EPERM;
> + goto fail_early;
> + }
> + }
>
> memflags |= MEMF_bits(domain_clamp_alloc_bitsize(
> d,
> @@ -292,11 +298,15 @@ static long memory_exchange(XEN_GUEST_HA
> if ( hypercall_preempt_check() )
> {
> exch.nr_exchanged = i << in_chunk_order;
> + rcu_unlock_domain(d);
> if ( copy_field_to_guest(arg, &exch, nr_exchanged) )
> return -EFAULT;
> return hypercall_create_continuation(
> __HYPERVISOR_memory_op, "lh", XENMEM_exchange, arg);
> }
> +
> + if (d->is_dying)
> + goto dying;
>
> /* Steal a chunk's worth of input pages from the domain. */
> for ( j = 0; j < (1UL << in_chunk_order); j++ )
> @@ -360,9 +370,24 @@ static long memory_exchange(XEN_GUEST_HA
> j = 0;
> while ( (page = page_list_remove_head(&out_chunk_list)) )
> {
> - if ( assign_pages(d, page, exch.out.extent_order,
> - MEMF_no_refcount) )
> + rc = (assign_pages(d, page, exch.out.extent_order,
> + MEMF_no_refcount));
> +
> + if (rc == -EINVAL)
> BUG();
> + /* Domain is being destroyed */
> + else if (rc == -ENODEV)
> + {
> + int dec_count;
> + dec_count = ( ( 1UL << exch.in.extent_order)*
> + (1UL << in_chunk_order) -
> + j * (1UL << exch.out.extent_order));
> + d->tot_pages -= dec_count;
> +
> + if (dec_count && !d->tot_pages)
> + put_domain(d);
> + break;
> + }
>
> /* Note that we ignore errors accessing the output extent list. */
> (void)__copy_from_guest_offset(
> @@ -378,15 +403,15 @@ static long memory_exchange(XEN_GUEST_HA
> (void)__copy_to_guest_offset(
> exch.out.extent_start, (i<<out_chunk_order)+j, &mfn, 1);
> }
> -
> j++;
> }
> - BUG_ON(j != (1UL << out_chunk_order));
> + BUG_ON( !(d->is_dying) && (j != (1UL << out_chunk_order)) );
> }
>
> exch.nr_exchanged = exch.in.nr_extents;
> if ( copy_field_to_guest(arg, &exch, nr_exchanged) )
> rc = -EFAULT;
> + rcu_unlock_domain(d);
> return rc;
>
> /*
> @@ -398,7 +423,8 @@ static long memory_exchange(XEN_GUEST_HA
> while ( (page = page_list_remove_head(&in_chunk_list)) )
> if ( assign_pages(d, page, 0, MEMF_no_refcount) )
> BUG();
> -
> + dying:
> + rcu_unlock_domain(d);
> /* Free any output pages we managed to allocate. */
> while ( (page = page_list_remove_head(&out_chunk_list)) )
> free_domheap_pages(page, exch.out.extent_order);
> diff -r 02003bee3e80 xen/common/page_alloc.c
> --- a/xen/common/page_alloc.c Thu Jun 25 18:31:10 2009 +0100
> +++ b/xen/common/page_alloc.c Sun Jun 28 23:39:28 2009 +0800
> @@ -1120,6 +1120,7 @@ int assign_pages(
> unsigned int memflags)
> {
> unsigned long i;
> + int rc = -EINVAL;
>
> spin_lock(&d->page_alloc_lock);
>
> @@ -1127,6 +1128,7 @@ int assign_pages(
> {
> gdprintk(XENLOG_INFO, "Cannot assign page to domain%d -- dying.\n",
> d->domain_id);
> + rc = -ENODEV;
> goto fail;
> }
>
> @@ -1136,6 +1138,7 @@ int assign_pages(
> {
> gdprintk(XENLOG_INFO, "Over-allocation for domain %u: %u > %u\n",
> d->domain_id, d->tot_pages + (1 << order), d->max_pages);
> + rc = -EINVAL;
> goto fail;
> }
>
> @@ -1160,7 +1163,7 @@ int assign_pages(
>
> fail:
> spin_unlock(&d->page_alloc_lock);
> - return -1;
> + return rc;
> }
>
>
>
>
> Keir Fraser wrote:
>> On 01/06/2009 09:40, "Jiang, Yunhong" <yunhong.jiang@xxxxxxxxx> wrote:
>>
>>> Thanks for the suggestion. I'm always nervous about API changes.
>>>
>>> I'm still considering whether there is any other potential usage for patch
>>> 5/6 (i.e. changing page tables or exchanging memory for another domain),
>>> but frustratingly I can find no other requirement.
>>
>> The API for XENMEM_exchange already permits specifying a foreign domain,
>> so you're just filling in the missing implementation. By the way, I did
>> not check your implementation, but the major subtlety is that you must
>> take care with domains which are dying (d->is_dying). Giving new pages to
>> such a domain is generally a bug because you race
>> domain_relinquish_resources(), which is walking the domain's page lists
>> and freeing the pages. You therefore risk leaving a zombie domain which
>> will not die (refcount never zero).
>>
>> See for example how that is handled in page_alloc.c for calls such as
>> XENMEM_populate_physmap, and how it is handled specially by
>> MMUEXT_PIN_L*_TABLE.
>>
>> -- Keir