I'll leave this alone for the moment, but let me try to explain the end
goal here:
1. We need to protect p2m entries on lookups (any lookups).
2. If performance becomes prohibitive, we then need to break up that lock.
3. Once the p2m lock is fine-grained, the PoD code's current locking
   assumptions break, so PoD will need its own lock.
4. Hence this patch.
Agree with you that it's ahead of the curve: it removes p2m_lock uses before
the p2m lock actually becomes fine-grained. So I'll leave this on the side
for now; a rough sketch of the intended lock ordering follows.
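
(A sketch only, under the assumption that the p2m lock eventually becomes a
fine-grained lookup lock; pod_lock/pod_unlock and
lock_page_alloc/unlock_page_alloc are the names used in the patch below,
while the function itself is made up for illustration.)

    static void pod_ordering_sketch(struct p2m_domain *p2m)
    {
        p2m_lock(p2m);          /* outermost: the p2m lock (eventually
                                 * fine-grained / per-range)               */
        pod_lock(p2m);          /* then the per-p2m PoD lock added here    */
        lock_page_alloc(p2m);   /* innermost: the domain page_alloc lock,
                                 * held only while moving pages on or off
                                 * the domain page list                    */
        /* ... move pages between the PoD lists and d->page_list ... */
        unlock_page_alloc(p2m);
        pod_unlock(p2m);
        p2m_unlock(p2m);
    }
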
Andres
> On Thu, Oct 27, 2011 at 1:33 PM, Andres Lagar-Cavilla
> <andres@xxxxxxxxxxxxxxxx> wrote:
>> xen/arch/x86/mm/mm-locks.h | 9 ++
>> xen/arch/x86/mm/p2m-pod.c | 145 +++++++++++++++++++++++++++------------------
>> xen/arch/x86/mm/p2m-pt.c | 3 +
>> xen/arch/x86/mm/p2m.c | 7 +-
>> xen/include/asm-x86/p2m.h | 25 ++-----
>> 5 files changed, 113 insertions(+), 76 deletions(-)
>>
>>
>> The PoD layer has a fragile locking discipline. It relies on the
>> p2m being globally locked, and it also relies on the page alloc
>> lock to protect some of its data structures. Replace this all by an
>> explicit pod lock: per p2m, order enforced.
>>
>> Two consequences:
>> - Critical sections in the pod code protected by the page alloc
>> lock are now reduced to modifications of the domain page list.
>> - When the p2m lock becomes fine-grained, there are no
>> assumptions broken in the PoD layer.
>>
>> Signed-off-by: Andres Lagar-Cavilla <andres@xxxxxxxxxxxxxxxx>
>>
>> diff -r 332775f72a30 -r 981073d78f7f xen/arch/x86/mm/mm-locks.h
>> --- a/xen/arch/x86/mm/mm-locks.h
>> +++ b/xen/arch/x86/mm/mm-locks.h
>> @@ -155,6 +155,15 @@ declare_mm_lock(p2m)
>> #define p2m_unlock(p) mm_unlock(&(p)->lock)
>> #define p2m_locked_by_me(p) mm_locked_by_me(&(p)->lock)
>>
>> +/* PoD lock (per-p2m-table)
>> + *
>> + * Protects private PoD data structs. */
>> +
>> +declare_mm_lock(pod)
>> +#define pod_lock(p) mm_lock(pod, &(p)->pod.lock)
>> +#define pod_unlock(p) mm_unlock(&(p)->pod.lock)
>> +#define pod_locked_by_me(p) mm_locked_by_me(&(p)->pod.lock)
>> +
>> /* Page alloc lock (per-domain)
>> *
>> * This is an external lock, not represented by an mm_lock_t. However,
>> diff -r 332775f72a30 -r 981073d78f7f xen/arch/x86/mm/p2m-pod.c
>> --- a/xen/arch/x86/mm/p2m-pod.c
>> +++ b/xen/arch/x86/mm/p2m-pod.c
>> @@ -63,6 +63,7 @@ static inline void unlock_page_alloc(str
>> * Populate-on-demand functionality
>> */
>>
>> +/* PoD lock held on entry */
>> static int
>> p2m_pod_cache_add(struct p2m_domain *p2m,
>> struct page_info *page,
>> @@ -114,43 +115,42 @@ p2m_pod_cache_add(struct p2m_domain *p2m
>> unmap_domain_page(b);
>> }
>>
>> + /* First, take all pages off the domain list */
>> lock_page_alloc(p2m);
>> -
>> - /* First, take all pages off the domain list */
>> for(i=0; i < 1 << order ; i++)
>> {
>> p = page + i;
>> page_list_del(p, &d->page_list);
>> }
>>
>> - /* Then add the first one to the appropriate populate-on-demand list */
>> - switch(order)
>> - {
>> - case PAGE_ORDER_2M:
>> - page_list_add_tail(page, &p2m->pod.super); /* lock: page_alloc */
>> - p2m->pod.count += 1 << order;
>> - break;
>> - case PAGE_ORDER_4K:
>> - page_list_add_tail(page, &p2m->pod.single); /* lock: page_alloc */
>> - p2m->pod.count += 1;
>> - break;
>> - default:
>> - BUG();
>> - }
>> -
>> /* Ensure that the PoD cache has never been emptied.
>> * This may cause "zombie domains" since the page will never be freed. */
>> BUG_ON( d->arch.relmem != RELMEM_not_started );
>>
>> unlock_page_alloc(p2m);
>>
>> + /* Then add the first one to the appropriate populate-on-demand list */
>> + switch(order)
>> + {
>> + case PAGE_ORDER_2M:
>> + page_list_add_tail(page, &p2m->pod.super);
>> + p2m->pod.count += 1 << order;
>> + break;
>> + case PAGE_ORDER_4K:
>> + page_list_add_tail(page, &p2m->pod.single);
>> + p2m->pod.count += 1;
>> + break;
>> + default:
>> + BUG();
>> + }
>> +
>> return 0;
>> }
>>
>> /* Get a page of size order from the populate-on-demand cache. Will break
>> * down 2-meg pages into singleton pages automatically. Returns null if
>> - * a superpage is requested and no superpages are available. Must be called
>> - * with the d->page_lock held. */
>> + * a superpage is requested and no superpages are available. */
>> +/* PoD lock held on entry */
>> static struct page_info * p2m_pod_cache_get(struct p2m_domain *p2m,
>> unsigned long order)
>> {
>> @@ -185,7 +185,7 @@ static struct page_info * p2m_pod_cache_
>> case PAGE_ORDER_2M:
>> BUG_ON( page_list_empty(&p2m->pod.super) );
>> p = page_list_remove_head(&p2m->pod.super);
>> - p2m->pod.count -= 1 << order; /* Lock: page_alloc */
>> + p2m->pod.count -= 1 << order;
>> break;
>> case PAGE_ORDER_4K:
>> BUG_ON( page_list_empty(&p2m->pod.single) );
>> @@ -197,16 +197,19 @@ static struct page_info * p2m_pod_cache_
>> }
>>
>> /* Put the pages back on the domain page_list */
>> + lock_page_alloc(p2m);
>> for ( i = 0 ; i < (1 << order); i++ )
>> {
>> BUG_ON(page_get_owner(p + i) != p2m->domain);
>> page_list_add_tail(p + i, &p2m->domain->page_list);
>> }
>> + unlock_page_alloc(p2m);
>>
>> return p;
>> }
>>
>> /* Set the size of the cache, allocating or freeing as necessary. */
>> +/* PoD lock held on entry */
>> static int
>> p2m_pod_set_cache_target(struct p2m_domain *p2m, unsigned long pod_target, int preemptible)
>> {
>> @@ -259,8 +262,6 @@ p2m_pod_set_cache_target(struct p2m_doma
>>
>> /* Grab the lock before checking that pod.super is empty, or the last
>> * entries may disappear before we grab the lock. */
>> - lock_page_alloc(p2m);
>> -
>> if ( (p2m->pod.count - pod_target) > SUPERPAGE_PAGES
>> && !page_list_empty(&p2m->pod.super) )
>> order = PAGE_ORDER_2M;
>> @@ -271,8 +272,6 @@ p2m_pod_set_cache_target(struct p2m_doma
>>
>> ASSERT(page != NULL);
>>
>> - unlock_page_alloc(p2m);
>> -
>> /* Then free them */
>> for ( i = 0 ; i < (1 << order) ; i++ )
>> {
>> @@ -348,7 +347,7 @@ p2m_pod_set_mem_target(struct domain *d,
>> int ret = 0;
>> unsigned long populated;
>>
>> - p2m_lock(p2m);
>> + pod_lock(p2m);
>>
>> /* P == B: Nothing to do. */
>> if ( p2m->pod.entry_count == 0 )
>> @@ -377,7 +376,7 @@ p2m_pod_set_mem_target(struct domain *d,
>> ret = p2m_pod_set_cache_target(p2m, pod_target, 1/*preemptible*/);
>>
>> out:
>> - p2m_unlock(p2m);
>> + pod_unlock(p2m);
>>
>> return ret;
>> }
>> @@ -390,7 +389,7 @@ p2m_pod_empty_cache(struct domain *d)
>>
>> /* After this barrier no new PoD activities can happen. */
>> BUG_ON(!d->is_dying);
>> - spin_barrier(&p2m->lock.lock);
>> + spin_barrier(&p2m->pod.lock.lock);
>>
>> lock_page_alloc(p2m);
>>
>> @@ -431,7 +430,8 @@ p2m_pod_offline_or_broken_hit(struct pag
>> if ( !(d = page_get_owner(p)) || !(p2m = p2m_get_hostp2m(d)) )
>> return 0;
>>
>> - lock_page_alloc(p2m);
>> + pod_lock(p2m);
>> +
>> bmfn = mfn_x(page_to_mfn(p));
>> page_list_for_each_safe(q, tmp, &p2m->pod.super)
>> {
>> @@ -462,12 +462,14 @@ p2m_pod_offline_or_broken_hit(struct pag
>> }
>> }
>>
>> - unlock_page_alloc(p2m);
>> + pod_unlock(p2m);
>> return 0;
>>
>> pod_hit:
>> + lock_page_alloc(p2m);
>> page_list_add_tail(p, &d->arch.relmem_list);
>> unlock_page_alloc(p2m);
>> + pod_unlock(p2m);
>> return 1;
>> }
>>
>> @@ -486,9 +488,9 @@ p2m_pod_offline_or_broken_replace(struct
>> if ( unlikely(!p) )
>> return;
>>
>> - p2m_lock(p2m);
>> + pod_lock(p2m);
>> p2m_pod_cache_add(p2m, p, PAGE_ORDER_4K);
>> - p2m_unlock(p2m);
>> + pod_unlock(p2m);
>> return;
>> }
>>
>> @@ -512,6 +514,7 @@ p2m_pod_decrease_reservation(struct doma
>> int steal_for_cache = 0;
>> int pod = 0, nonpod = 0, ram = 0;
>>
>> + pod_lock(p2m);
>>
>> /* If we don't have any outstanding PoD entries, let things take their
>> * course */
>> @@ -521,11 +524,10 @@ p2m_pod_decrease_reservation(struct doma
>> /* Figure out if we need to steal some freed memory for our cache */
>> steal_for_cache = ( p2m->pod.entry_count > p2m->pod.count );
>>
>> - p2m_lock(p2m);
>> audit_p2m(p2m, 1);
>>
>> if ( unlikely(d->is_dying) )
>> - goto out_unlock;
>> + goto out;
>>
>> /* See what's in here. */
>> /* FIXME: Add contiguous; query for PSE entries? */
>
> I don't think this can be quite right.
>
> The point of holding the p2m lock here is so that the p2m entries
> don't change between the gfn_to_mfn_query() here and the
> set_p2m_entry() below. The balloon driver racing with other vcpus
> populating pages is exactly the kind of race we expect to experience.
> And in any case, this change will cause set_p2m_entry() to ASSERT()
> because we're not holding the p2m lock.
>
> Or am I missing something?
>
> I haven't yet looked at the rest of the patch series, but it would
> definitely be better for people in the future looking back and trying
> to figure out why the code is the way that it is if even transitory
> changesets don't introduce "temporary" violations of invariants. :-)
>
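
Purely to spell out the window being described (illustration only, not part
of the patch; the set_p2m_entry() call is the one from the hunk below, and
its ASSERT() on the p2m lock is as described above):

    /* vcpu A, p2m_pod_decrease_reservation(), now holding only the pod lock: */
    /* 1. gfn_to_mfn_query() reads the entry for gpfn + i and sees a PoD type */
    /* 2. race window: vcpu B runs p2m_pod_demand_populate() here and turns   */
    /*    that same entry into a real p2m_ram_rw mapping                      */
    /* 3. vcpu A then clobbers the freshly populated entry based on the stale */
    /*    type it read in step 1:                                             */
    set_p2m_entry(p2m, gpfn + i, _mfn(INVALID_MFN), 0,
                  p2m_invalid, p2m->default_access);
    /* 4. and set_p2m_entry() itself will ASSERT() because the p2m lock is    */
    /*    no longer held on this path                                         */
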
>> @@ -547,14 +549,14 @@ p2m_pod_decrease_reservation(struct doma
>>
>> /* No populate-on-demand? Don't need to steal anything? Then we're done!*/
>> if(!pod && !steal_for_cache)
>> - goto out_unlock;
>> + goto out_audit;
>>
>> if ( !nonpod )
>> {
>> /* All PoD: Mark the whole region invalid and tell caller
>> * we're done. */
>> set_p2m_entry(p2m, gpfn, _mfn(INVALID_MFN), order, p2m_invalid,
>> p2m->default_access);
>> - p2m->pod.entry_count-=(1<<order); /* Lock: p2m */
>> + p2m->pod.entry_count-=(1<<order);
>> BUG_ON(p2m->pod.entry_count < 0);
>> ret = 1;
>> goto out_entry_check;
>> @@ -577,7 +579,7 @@ p2m_pod_decrease_reservation(struct doma
>> if ( t == p2m_populate_on_demand )
>> {
>> set_p2m_entry(p2m, gpfn + i, _mfn(INVALID_MFN), 0,
>> p2m_invalid, p2m->default_access);
>> - p2m->pod.entry_count--; /* Lock: p2m */
>> + p2m->pod.entry_count--;
>> BUG_ON(p2m->pod.entry_count < 0);
>> pod--;
>> }
>> @@ -613,11 +615,11 @@ out_entry_check:
>> p2m_pod_set_cache_target(p2m, p2m->pod.entry_count, 0/*can't preempt*/);
>> }
>>
>> -out_unlock:
>> +out_audit:
>> audit_p2m(p2m, 1);
>> - p2m_unlock(p2m);
>>
>> out:
>> + pod_unlock(p2m);
>> return ret;
>> }
>>
>> @@ -630,20 +632,24 @@ void p2m_pod_dump_data(struct domain *d)
>>
>>
>> /* Search for all-zero superpages to be reclaimed as superpages for the
>> - * PoD cache. Must be called w/ p2m lock held, page_alloc lock not held. */
>> -static int
>> + * PoD cache. Must be called w/ pod lock held, page_alloc lock not held. */
>> +static void
>
> For the same reason, this must be called with the p2m lock held: it
> calls gfn_to_mfn_query() and then calls set_p2m_entry(). As it
> happens, this always *is* called with the p2m lock held at the moment;
> but the comment still needs to reflect this. Similarly in
> p2m_pod_zero_check().
>
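
One way to make that requirement explicit in the code itself rather than
only in the comment (a sketch, not part of this patch; both helpers are the
ones declared in mm-locks.h at the top of the patch):

    static void p2m_pod_zero_check_preconditions(struct p2m_domain *p2m)
    {
        ASSERT(p2m_locked_by_me(p2m));   /* entries must not change between
                                          * gfn_to_mfn_query() and
                                          * set_p2m_entry()                 */
        ASSERT(pod_locked_by_me(p2m));   /* pod.count and the pod lists     */
    }
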
>> p2m_pod_zero_check_superpage(struct p2m_domain *p2m, unsigned long gfn)
>> {
>> mfn_t mfn, mfn0 = _mfn(INVALID_MFN);
>> p2m_type_t type, type0 = 0;
>> unsigned long * map = NULL;
>> - int ret=0, reset = 0;
>> + int success = 0, reset = 0;
>> int i, j;
>> int max_ref = 1;
>> struct domain *d = p2m->domain;
>>
>> if ( !superpage_aligned(gfn) )
>> - goto out;
>> + return;
>> +
>> + /* If we were enforcing ordering against p2m locks, this is a place
>> + * to drop the PoD lock and re-acquire it once we're done mucking with
>> + * the p2m. */
>>
>> /* Allow an extra refcount for one shadow pt mapping in shadowed domains */
>> if ( paging_mode_shadow(d) )
>> @@ -751,19 +757,24 @@ p2m_pod_zero_check_superpage(struct p2m_
>> __trace_var(TRC_MEM_POD_ZERO_RECLAIM, 0, sizeof(t), &t);
>> }
>>
>> - /* Finally! We've passed all the checks, and can add the mfn superpage
>> - * back on the PoD cache, and account for the new p2m PoD entries */
>> - p2m_pod_cache_add(p2m, mfn_to_page(mfn0), PAGE_ORDER_2M);
>> - p2m->pod.entry_count += SUPERPAGE_PAGES;
>> + success = 1;
>> +
>>
>> out_reset:
>> if ( reset )
>> set_p2m_entry(p2m, gfn, mfn0, 9, type0, p2m->default_access);
>>
>> out:
>> - return ret;
>> + if ( success )
>> + {
>> + /* Finally! We've passed all the checks, and can add the mfn superpage
>> + * back on the PoD cache, and account for the new p2m PoD entries */
>> + p2m_pod_cache_add(p2m, mfn_to_page(mfn0), PAGE_ORDER_2M);
>> + p2m->pod.entry_count += SUPERPAGE_PAGES;
>> + }
>> }
>>
>> +/* On entry, PoD lock is held */
>> static void
>> p2m_pod_zero_check(struct p2m_domain *p2m, unsigned long *gfns, int count)
>> {
>> @@ -775,6 +786,8 @@ p2m_pod_zero_check(struct p2m_domain *p2
>> int i, j;
>> int max_ref = 1;
>>
>> + /* Also the right time to drop pod_lock if enforcing ordering against p2m_lock */
>> +
>> /* Allow an extra refcount for one shadow pt mapping in shadowed domains */
>> if ( paging_mode_shadow(d) )
>> max_ref++;
>> @@ -841,7 +854,6 @@ p2m_pod_zero_check(struct p2m_domain *p2
>> if( *(map[i]+j) != 0 )
>> break;
>>
>> - unmap_domain_page(map[i]);
>>
>> /* See comment in p2m_pod_zero_check_superpage() re gnttab
>> * check timing. */
>> @@ -849,8 +861,15 @@ p2m_pod_zero_check(struct p2m_domain *p2
>> {
>> set_p2m_entry(p2m, gfns[i], mfns[i], PAGE_ORDER_4K,
>> types[i], p2m->default_access);
>> + unmap_domain_page(map[i]);
>> + map[i] = NULL;
>> }
>> - else
>> + }
>> +
>> + /* Finally, add to cache */
>> + for ( i=0; i < count; i++ )
>> + {
>> + if ( map[i] )
>> {
>> if ( tb_init_done )
>> {
>> @@ -867,6 +886,8 @@ p2m_pod_zero_check(struct p2m_domain *p2
>> __trace_var(TRC_MEM_POD_ZERO_RECLAIM, 0, sizeof(t), &t);
>> }
>>
>> + unmap_domain_page(map[i]);
>> +
>> /* Add to cache, and account for the new p2m PoD entry */
>> p2m_pod_cache_add(p2m, mfn_to_page(mfns[i]), PAGE_ORDER_4K);
>> p2m->pod.entry_count++;
>> @@ -876,6 +897,7 @@ p2m_pod_zero_check(struct p2m_domain *p2
>> }
>>
>> #define POD_SWEEP_LIMIT 1024
>> +/* Only one CPU at a time is guaranteed to enter a sweep */
>> static void
>> p2m_pod_emergency_sweep_super(struct p2m_domain *p2m)
>> {
>> @@ -964,7 +986,8 @@ p2m_pod_demand_populate(struct p2m_domai
>>
>> ASSERT(p2m_locked_by_me(p2m));
>>
>> - /* This check is done with the p2m lock held. This will make sure that
>> + pod_lock(p2m);
>> + /* This check is done with the pod lock held. This will make sure that
>> * even if d->is_dying changes under our feet, p2m_pod_empty_cache()
>> * won't start until we're done. */
>> if ( unlikely(d->is_dying) )
>> @@ -974,6 +997,7 @@ p2m_pod_demand_populate(struct p2m_domai
>> * 1GB region to 2MB chunks for a retry. */
>> if ( order == PAGE_ORDER_1G )
>> {
>> + pod_unlock(p2m);
>> gfn_aligned = (gfn >> order) << order;
>> /* Note that we are supposed to call set_p2m_entry() 512 times to
>> * split 1GB into 512 2MB pages here. But We only do once here because
>> @@ -983,6 +1007,7 @@ p2m_pod_demand_populate(struct p2m_domai
>> set_p2m_entry(p2m, gfn_aligned, _mfn(0), PAGE_ORDER_2M,
>> p2m_populate_on_demand, p2m->default_access);
>> audit_p2m(p2m, 1);
>> + /* This is because the ept/pt caller locks the p2m recursively */
>> p2m_unlock(p2m);
>> return 0;
>> }
>> @@ -996,11 +1021,15 @@ p2m_pod_demand_populate(struct p2m_domai
>>
>> /* If we're low, start a sweep */
>> if ( order == PAGE_ORDER_2M && page_list_empty(&p2m->pod.super) )
>> + /* Note that sweeps scan other ranges in the p2m. In a scenario
>> + * in which p2m locks are order-enforced wrt pod lock and p2m
>> + * locks are fine grained, this will result in deadlock */
>> p2m_pod_emergency_sweep_super(p2m);
>>
>> if ( page_list_empty(&p2m->pod.single) &&
>> ( ( order == PAGE_ORDER_4K )
>> || (order == PAGE_ORDER_2M && page_list_empty(&p2m->pod.super) ) ) )
>> + /* Same comment regarding deadlock applies */
>> p2m_pod_emergency_sweep(p2m);
>> }
>>
>> @@ -1008,8 +1037,6 @@ p2m_pod_demand_populate(struct p2m_domai
>> if ( q == p2m_guest && gfn > p2m->pod.max_guest )
>> p2m->pod.max_guest = gfn;
>>
>> - lock_page_alloc(p2m);
>> -
>> if ( p2m->pod.count == 0 )
>> goto out_of_memory;
>>
>> @@ -1022,8 +1049,6 @@ p2m_pod_demand_populate(struct p2m_domai
>>
>> BUG_ON((mfn_x(mfn) & ((1 << order)-1)) != 0);
>>
>> - unlock_page_alloc(p2m);
>> -
>> gfn_aligned = (gfn >> order) << order;
>>
>> set_p2m_entry(p2m, gfn_aligned, mfn, order, p2m_ram_rw,
>> p2m->default_access);
>> @@ -1034,8 +1059,9 @@ p2m_pod_demand_populate(struct p2m_domai
>> paging_mark_dirty(d, mfn_x(mfn) + i);
>> }
>>
>> - p2m->pod.entry_count -= (1 << order); /* Lock: p2m */
>> + p2m->pod.entry_count -= (1 << order);
>> BUG_ON(p2m->pod.entry_count < 0);
>> + pod_unlock(p2m);
>>
>> if ( tb_init_done )
>> {
>> @@ -1054,16 +1080,17 @@ p2m_pod_demand_populate(struct p2m_domai
>>
>> return 0;
>> out_of_memory:
>> - unlock_page_alloc(p2m);
>> + pod_unlock(p2m);
>>
>> printk("%s: Out of populate-on-demand memory! tot_pages %" PRIu32 "
>> pod_entries %" PRIi32 "\n",
>> __func__, d->tot_pages, p2m->pod.entry_count);
>> domain_crash(d);
>> out_fail:
>> + pod_unlock(p2m);
>> return -1;
>> remap_and_retry:
>> BUG_ON(order != PAGE_ORDER_2M);
>> - unlock_page_alloc(p2m);
>> + pod_unlock(p2m);
>>
>> /* Remap this 2-meg region in singleton chunks */
>> gfn_aligned = (gfn>>order)<<order;
>> @@ -1133,9 +1160,11 @@ guest_physmap_mark_populate_on_demand(st
>> rc = -EINVAL;
>> else
>> {
>> - p2m->pod.entry_count += 1 << order; /* Lock: p2m */
>> + pod_lock(p2m);
>> + p2m->pod.entry_count += 1 << order;
>> p2m->pod.entry_count -= pod_count;
>> BUG_ON(p2m->pod.entry_count < 0);
>> + pod_unlock(p2m);
>> }
>>
>> audit_p2m(p2m, 1);
>> diff -r 332775f72a30 -r 981073d78f7f xen/arch/x86/mm/p2m-pt.c
>> --- a/xen/arch/x86/mm/p2m-pt.c
>> +++ b/xen/arch/x86/mm/p2m-pt.c
>> @@ -1001,6 +1001,7 @@ void audit_p2m(struct p2m_domain *p2m, i
>> if ( !paging_mode_translate(d) )
>> return;
>>
>> + pod_lock(p2m);
>> //P2M_PRINTK("p2m audit starts\n");
>>
>> test_linear = ( (d == current->domain)
>> @@ -1247,6 +1248,8 @@ void audit_p2m(struct p2m_domain *p2m, i
>> pmbad, mpbad);
>> WARN();
>> }
>> +
>> + pod_unlock(p2m);
>> }
>> #endif /* P2M_AUDIT */
>>
>> diff -r 332775f72a30 -r 981073d78f7f xen/arch/x86/mm/p2m.c
>> --- a/xen/arch/x86/mm/p2m.c
>> +++ b/xen/arch/x86/mm/p2m.c
>> @@ -72,6 +72,7 @@ boolean_param("hap_2mb", opt_hap_2mb);
>> static void p2m_initialise(struct domain *d, struct p2m_domain *p2m)
>> {
>> mm_lock_init(&p2m->lock);
>> + mm_lock_init(&p2m->pod.lock);
>> INIT_LIST_HEAD(&p2m->np2m_list);
>> INIT_PAGE_LIST_HEAD(&p2m->pages);
>> INIT_PAGE_LIST_HEAD(&p2m->pod.super);
>> @@ -506,8 +507,10 @@ guest_physmap_add_entry(struct domain *d
>> rc = -EINVAL;
>> else
>> {
>> - p2m->pod.entry_count -= pod_count; /* Lock: p2m */
>> + pod_lock(p2m);
>> + p2m->pod.entry_count -= pod_count;
>> BUG_ON(p2m->pod.entry_count < 0);
>> + pod_unlock(p2m);
>> }
>> }
>>
>> @@ -1125,8 +1128,10 @@ p2m_flush_table(struct p2m_domain *p2m)
>> /* "Host" p2m tables can have shared entries &c that need a bit more
>> * care when discarding them */
>> ASSERT(p2m_is_nestedp2m(p2m));
>> + pod_lock(p2m);
>> ASSERT(page_list_empty(&p2m->pod.super));
>> ASSERT(page_list_empty(&p2m->pod.single));
>> + pod_unlock(p2m);
>>
>> /* This is no longer a valid nested p2m for any address space */
>> p2m->cr3 = CR3_EADDR;
>> diff -r 332775f72a30 -r 981073d78f7f xen/include/asm-x86/p2m.h
>> --- a/xen/include/asm-x86/p2m.h
>> +++ b/xen/include/asm-x86/p2m.h
>> @@ -257,24 +257,13 @@ struct p2m_domain {
>> unsigned long max_mapped_pfn;
>>
>> /* Populate-on-demand variables
>> - * NB on locking. {super,single,count} are
>> - * covered by d->page_alloc_lock, since they're almost always used in
>> - * conjunction with that functionality. {entry_count} is covered by
>> - * the domain p2m lock, since it's almost always used in conjunction
>> - * with changing the p2m tables.
>> *
>> - * At this point, both locks are held in two places. In both,
>> - * the order is [p2m,page_alloc]:
>> - * + p2m_pod_decrease_reservation() calls p2m_pod_cache_add(),
>> - * which grabs page_alloc
>> - * + p2m_pod_demand_populate() grabs both; the p2m lock to avoid
>> - * double-demand-populating of pages, the page_alloc lock to
>> - * protect moving stuff from the PoD cache to the domain page list.
>> - *
>> - * We enforce this lock ordering through a construct in mm-locks.h.
>> - * This demands, however, that we store the previous lock-ordering
>> - * level in effect before grabbing the page_alloc lock.
>> - */
>> + * All variables are protected with the pod lock. We cannot rely on
>> + * the p2m lock if it's turned into a fine-grained lock.
>> + * We only use the domain page_alloc lock for additions and
>> + * deletions to the domain's page list. Because we use it nested
>> + * within the PoD lock, we enforce its ordering (by remembering
>> + * the unlock level). */
>> struct {
>> struct page_list_head super, /* List of superpages */
>> single; /* Non-super lists */
>> @@ -283,6 +272,8 @@ struct p2m_domain {
>> unsigned reclaim_super; /* Last gpfn of a scan */
>> unsigned reclaim_single; /* Last gpfn of a scan */
>> unsigned max_guest; /* gpfn of max guest demand-populate */
>> + mm_lock_t lock; /* Locking of private pod structs, *
>> + * not relying on the p2m lock. */
>> int page_alloc_unlock_level; /* To enforce lock ordering */
>> } pod;
>> };
>>
>>
>
_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel