Allocate the CPU masks embedded in struct domain and struct vcpu dynamically: masks embedded by value prevent sizing these structures independently of NR_CPUS. The basic concept (cpumask_var_t, in xen/include/xen/cpumask.h) is taken from recent Linux.

For scalability, many other uses of cpumask_t should likewise be replaced by cpumask_var_t, particularly function-local variables. This implies that no function should take a cpumask_t parameter by value, and that the whole old cpumask interface (cpus_...()) should go away in favor of the new (cpumask_...()) one. A brief usage sketch is appended after the patch.

Signed-off-by: Jan Beulich

--- a/xen/arch/ia64/xen/mm.c
+++ b/xen/arch/ia64/xen/mm.c
@@ -3191,8 +3191,9 @@ int get_page_type(struct page_info *page
 * may be unnecessary (e.g., page was GDT/LDT) but those
 * circumstances should be very rare.
 */
- cpumask_t mask =
- page_get_owner(page)->domain_dirty_cpumask;
+ cpumask_t mask;
+
+ cpumask_copy(&mask, page_get_owner(page)->domain_dirty_cpumask);
 tlbflush_filter(mask, page->tlbflush_timestamp);
 if ( unlikely(!cpus_empty(mask)) )
--- a/xen/arch/ia64/xen/vhpt.c
+++ b/xen/arch/ia64/xen/vhpt.c
@@ -516,7 +516,7 @@ void domain_flush_tlb_vhpt(struct domain
 on_each_cpu((void (*)(void *))local_flush_tlb_all, NULL, 1);
 else
 on_each_cpu((void (*)(void *))flush_tlb_vhpt_all, d, 1);
- cpus_clear (d->domain_dirty_cpumask);
+ cpumask_clear(d->domain_dirty_cpumask);
 }
 void flush_tlb_for_log_dirty(struct domain *d)
@@ -545,7 +545,7 @@ void flush_tlb_for_log_dirty(struct doma
 } else {
 on_each_cpu((void (*)(void *))flush_tlb_vhpt_all, d, 1);
 }
- cpus_clear (d->domain_dirty_cpumask);
+ cpumask_clear(d->domain_dirty_cpumask);
 }
 void flush_tlb_mask(const cpumask_t *mask)
--- a/xen/arch/x86/cpu/mcheck/vmce.c
+++ b/xen/arch/x86/cpu/mcheck/vmce.c
@@ -321,8 +321,8 @@ int inject_vmce(struct domain *d)
 d->domain_id);
 if ( guest_has_trap_callback(d, 0, TRAP_machine_check) )
 {
- d->vcpu[0]->cpu_affinity_tmp =
- d->vcpu[0]->cpu_affinity;
+ cpumask_copy(d->vcpu[0]->cpu_affinity_tmp,
+ d->vcpu[0]->cpu_affinity);
 cpus_clear(affinity);
 cpu_set(cpu, affinity);
 mce_printk(MCE_VERBOSE, "MCE: CPU%d set affinity, old %d\n",
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -132,8 +132,8 @@ void startup_cpu_idle_loop(void)
 struct vcpu *v = current;
 ASSERT(is_idle_vcpu(v));
- cpu_set(smp_processor_id(), v->domain->domain_dirty_cpumask);
- cpu_set(smp_processor_id(), v->vcpu_dirty_cpumask);
+ cpumask_set_cpu(v->processor, v->domain->domain_dirty_cpumask);
+ cpumask_set_cpu(v->processor, v->vcpu_dirty_cpumask);
 reset_stack_and_jump(idle_loop);
 }
@@ -1391,7 +1391,7 @@ static void __context_switch(void)
 struct desc_ptr gdt_desc;
 ASSERT(p != n);
- ASSERT(cpus_empty(n->vcpu_dirty_cpumask));
+ ASSERT(cpumask_empty(n->vcpu_dirty_cpumask));
 if ( !is_idle_vcpu(p) )
 {
@@ -1408,8 +1408,8 @@ static void __context_switch(void)
 * which is synchronised on that function.
*/ if ( p->domain != n->domain ) - cpu_set(cpu, n->domain->domain_dirty_cpumask); - cpu_set(cpu, n->vcpu_dirty_cpumask); + cpumask_set_cpu(cpu, n->domain->domain_dirty_cpumask); + cpumask_set_cpu(cpu, n->vcpu_dirty_cpumask); if ( !is_idle_vcpu(n) ) { @@ -1452,8 +1452,8 @@ static void __context_switch(void) } if ( p->domain != n->domain ) - cpu_clear(cpu, p->domain->domain_dirty_cpumask); - cpu_clear(cpu, p->vcpu_dirty_cpumask); + cpumask_clear_cpu(cpu, p->domain->domain_dirty_cpumask); + cpumask_clear_cpu(cpu, p->vcpu_dirty_cpumask); per_cpu(curr_vcpu, cpu) = n; } @@ -1462,10 +1462,11 @@ static void __context_switch(void) void context_switch(struct vcpu *prev, struct vcpu *next) { unsigned int cpu = smp_processor_id(); - cpumask_t dirty_mask = next->vcpu_dirty_cpumask; + cpumask_t dirty_mask; ASSERT(local_irq_is_enabled()); + cpumask_copy(&dirty_mask, next->vcpu_dirty_cpumask); /* Allow at most one CPU at a time to be dirty. */ ASSERT(cpus_weight(dirty_mask) <= 1); if ( unlikely(!cpu_isset(cpu, dirty_mask) && !cpus_empty(dirty_mask)) ) @@ -1557,11 +1558,11 @@ void sync_local_execstate(void) void sync_vcpu_execstate(struct vcpu *v) { - if ( cpu_isset(smp_processor_id(), v->vcpu_dirty_cpumask) ) + if ( cpumask_test_cpu(smp_processor_id(), v->vcpu_dirty_cpumask) ) sync_local_execstate(); /* Other cpus call __sync_local_execstate from flush ipi handler. */ - flush_tlb_mask(&v->vcpu_dirty_cpumask); + flush_tlb_mask(v->vcpu_dirty_cpumask); } #define next_arg(fmt, args) ({ \ @@ -1922,7 +1923,7 @@ int domain_relinquish_resources(struct d int ret; struct vcpu *v; - BUG_ON(!cpus_empty(d->domain_dirty_cpumask)); + BUG_ON(!cpumask_empty(d->domain_dirty_cpumask)); switch ( d->arch.relmem ) { --- a/xen/arch/x86/hvm/hvm.c +++ b/xen/arch/x86/hvm/hvm.c @@ -3089,7 +3089,7 @@ static int hvmop_flush_tlb_all(void) paging_update_cr3(v); /* Flush all dirty TLBs. */ - flush_tlb_mask(&d->domain_dirty_cpumask); + flush_tlb_mask(d->domain_dirty_cpumask); /* Done. */ for_each_vcpu ( d, v ) --- a/xen/arch/x86/hvm/svm/svm.c +++ b/xen/arch/x86/hvm/svm/svm.c @@ -1348,7 +1348,7 @@ static int svm_is_erratum_383(struct cpu wrmsrl(MSR_IA32_MCG_STATUS, msr_content & ~(1ULL << 2)); /* flush TLB */ - flush_tlb_mask(&v->domain->domain_dirty_cpumask); + flush_tlb_mask(v->domain->domain_dirty_cpumask); return 1; } --- a/xen/arch/x86/hvm/vmx/vmx.c +++ b/xen/arch/x86/hvm/vmx/vmx.c @@ -88,8 +88,14 @@ static int vmx_domain_initialise(struct d->arch.hvm_domain.vmx.ept_control.asr = pagetable_get_pfn(p2m_get_pagetable(p2m_get_hostp2m(d))); + if ( !zalloc_cpumask_var(&d->arch.hvm_domain.vmx.ept_synced) ) + return -ENOMEM; + if ( (rc = vmx_alloc_vlapic_mapping(d)) != 0 ) + { + free_cpumask_var(d->arch.hvm_domain.vmx.ept_synced); return rc; + } return 0; } @@ -98,6 +104,7 @@ static void vmx_domain_destroy(struct do { if ( paging_mode_hap(d) ) on_each_cpu(__ept_sync_domain, d, 1); + free_cpumask_var(d->arch.hvm_domain.vmx.ept_synced); vmx_free_vlapic_mapping(d); } @@ -660,8 +667,9 @@ static void vmx_ctxt_switch_to(struct vc { unsigned int cpu = smp_processor_id(); /* Test-and-test-and-set this CPU in the EPT-is-synced mask. 
*/ - if ( !cpu_isset(cpu, d->arch.hvm_domain.vmx.ept_synced) && - !cpu_test_and_set(cpu, d->arch.hvm_domain.vmx.ept_synced) ) + if ( !cpumask_test_cpu(cpu, d->arch.hvm_domain.vmx.ept_synced) && + !cpumask_test_and_set_cpu(cpu, + d->arch.hvm_domain.vmx.ept_synced) ) __invept(INVEPT_SINGLE_CONTEXT, ept_get_eptp(d), 0); } @@ -1217,10 +1225,10 @@ void ept_sync_domain(struct domain *d) * the ept_synced mask before on_selected_cpus() reads it, resulting in * unnecessary extra flushes, to avoid allocating a cpumask_t on the stack. */ - cpus_and(d->arch.hvm_domain.vmx.ept_synced, - d->domain_dirty_cpumask, cpu_online_map); + cpumask_and(d->arch.hvm_domain.vmx.ept_synced, + d->domain_dirty_cpumask, &cpu_online_map); - on_selected_cpus(&d->arch.hvm_domain.vmx.ept_synced, + on_selected_cpus(d->arch.hvm_domain.vmx.ept_synced, __ept_sync_domain, d, 1); } --- a/xen/arch/x86/mm.c +++ b/xen/arch/x86/mm.c @@ -612,7 +612,7 @@ static void invalidate_shadow_ldt(struct /* Rid TLBs of stale mappings (guest mappings and shadow mappings). */ if ( flush ) - flush_tlb_mask(&v->vcpu_dirty_cpumask); + flush_tlb_mask(v->vcpu_dirty_cpumask); out: spin_unlock(&v->arch.shadow_ldt_lock); @@ -1338,7 +1338,7 @@ static void pae_flush_pgd( if ( pagetable_get_pfn(v->arch.guest_table) == mfn ) { paging_update_cr3(v); - cpus_or(m, m, v->vcpu_dirty_cpumask); + cpumask_or(&m, &m, v->vcpu_dirty_cpumask); } flush_tlb_mask(&m); } @@ -1365,7 +1365,7 @@ static void pae_flush_pgd( spin_unlock(&cache->lock); } - flush_tlb_mask(&d->domain_dirty_cpumask); + flush_tlb_mask(d->domain_dirty_cpumask); } #else # define pae_flush_pgd(mfn, idx, nl3e) ((void)0) @@ -2421,7 +2421,9 @@ static int __get_page_type(struct page_i * may be unnecessary (e.g., page was GDT/LDT) but those * circumstances should be very rare. 
*/ - cpumask_t mask = d->domain_dirty_cpumask; + cpumask_t mask; + + cpumask_copy(&mask, d->domain_dirty_cpumask); /* Don't flush if the timestamp is old enough */ tlbflush_filter(mask, page->tlbflush_timestamp); @@ -2903,7 +2905,7 @@ static inline int vcpumask_to_pcpumask( if ( (vcpu_id >= d->max_vcpus) ) return 0; if ( ((v = d->vcpu[vcpu_id]) != NULL) ) - cpus_or(*pmask, *pmask, v->vcpu_dirty_cpumask); + cpumask_or(pmask, pmask, v->vcpu_dirty_cpumask); } } } @@ -3161,11 +3163,11 @@ int do_mmuext_op( } case MMUEXT_TLB_FLUSH_ALL: - flush_tlb_mask(&d->domain_dirty_cpumask); + flush_tlb_mask(d->domain_dirty_cpumask); break; case MMUEXT_INVLPG_ALL: - flush_tlb_one_mask(&d->domain_dirty_cpumask, op.arg1.linear_addr); + flush_tlb_one_mask(d->domain_dirty_cpumask, op.arg1.linear_addr); break; case MMUEXT_FLUSH_CACHE: @@ -4345,7 +4347,7 @@ static int __do_update_va_mapping( flush_tlb_local(); break; case UVMF_ALL: - flush_tlb_mask(&d->domain_dirty_cpumask); + flush_tlb_mask(d->domain_dirty_cpumask); break; default: rc = vcpumask_to_pcpumask(d, const_guest_handle_from_ptr(bmap_ptr, @@ -4365,7 +4367,7 @@ static int __do_update_va_mapping( flush_tlb_one_local(va); break; case UVMF_ALL: - flush_tlb_one_mask(&d->domain_dirty_cpumask, va); + flush_tlb_one_mask(d->domain_dirty_cpumask, va); break; default: rc = vcpumask_to_pcpumask(d, const_guest_handle_from_ptr(bmap_ptr, --- a/xen/arch/x86/mm/hap/hap.c +++ b/xen/arch/x86/mm/hap/hap.c @@ -72,7 +72,7 @@ static int hap_enable_vram_tracking(stru for (i = dirty_vram->begin_pfn; i < dirty_vram->end_pfn; i++) p2m_change_type(p2m_get_hostp2m(d), i, p2m_ram_rw, p2m_ram_logdirty); - flush_tlb_mask(&d->domain_dirty_cpumask); + flush_tlb_mask(d->domain_dirty_cpumask); return 0; } @@ -92,7 +92,7 @@ static int hap_disable_vram_tracking(str for (i = dirty_vram->begin_pfn; i < dirty_vram->end_pfn; i++) p2m_change_type(p2m_get_hostp2m(d), i, p2m_ram_logdirty, p2m_ram_rw); - flush_tlb_mask(&d->domain_dirty_cpumask); + flush_tlb_mask(d->domain_dirty_cpumask); return 0; } @@ -108,7 +108,7 @@ static void hap_clean_vram_tracking(stru for (i = dirty_vram->begin_pfn; i < dirty_vram->end_pfn; i++) p2m_change_type(p2m_get_hostp2m(d), i, p2m_ram_rw, p2m_ram_logdirty); - flush_tlb_mask(&d->domain_dirty_cpumask); + flush_tlb_mask(d->domain_dirty_cpumask); } static void hap_vram_tracking_init(struct domain *d) @@ -202,7 +202,7 @@ static int hap_enable_log_dirty(struct d /* set l1e entries of P2M table to be read-only. */ p2m_change_entry_type_global(p2m_get_hostp2m(d), p2m_ram_rw, p2m_ram_logdirty); - flush_tlb_mask(&d->domain_dirty_cpumask); + flush_tlb_mask(d->domain_dirty_cpumask); return 0; } @@ -223,7 +223,7 @@ static void hap_clean_dirty_bitmap(struc /* set l1e entries of P2M table to be read-only. 
*/ p2m_change_entry_type_global(p2m_get_hostp2m(d), p2m_ram_rw, p2m_ram_logdirty); - flush_tlb_mask(&d->domain_dirty_cpumask); + flush_tlb_mask(d->domain_dirty_cpumask); } void hap_logdirty_init(struct domain *d) @@ -842,7 +842,7 @@ hap_write_p2m_entry(struct vcpu *v, unsi safe_write_pte(p, new); if ( (old_flags & _PAGE_PRESENT) && (level == 1 || (level == 2 && (old_flags & _PAGE_PSE))) ) - flush_tlb_mask(&v->domain->domain_dirty_cpumask); + flush_tlb_mask(v->domain->domain_dirty_cpumask); #if CONFIG_PAGING_LEVELS == 3 /* install P2M in monitor table for PAE Xen */ --- a/xen/arch/x86/mm/shadow/common.c +++ b/xen/arch/x86/mm/shadow/common.c @@ -703,7 +703,7 @@ static int oos_remove_write_access(struc } if ( ftlb ) - flush_tlb_mask(&v->domain->domain_dirty_cpumask); + flush_tlb_mask(v->domain->domain_dirty_cpumask); return 0; } @@ -1153,7 +1153,7 @@ sh_validate_guest_pt_write(struct vcpu * rc = sh_validate_guest_entry(v, gmfn, entry, size); if ( rc & SHADOW_SET_FLUSH ) /* Need to flush TLBs to pick up shadow PT changes */ - flush_tlb_mask(&d->domain_dirty_cpumask); + flush_tlb_mask(d->domain_dirty_cpumask); if ( rc & SHADOW_SET_ERROR ) { /* This page is probably not a pagetable any more: tear it out of the @@ -1369,7 +1369,7 @@ static void _shadow_prealloc( /* See if that freed up enough space */ if ( d->arch.paging.shadow.free_pages >= pages ) { - flush_tlb_mask(&d->domain_dirty_cpumask); + flush_tlb_mask(d->domain_dirty_cpumask); return; } } @@ -1422,7 +1422,7 @@ static void shadow_blow_tables(struct do pagetable_get_mfn(v->arch.shadow_table[i]), 0); /* Make sure everyone sees the unshadowings */ - flush_tlb_mask(&d->domain_dirty_cpumask); + flush_tlb_mask(d->domain_dirty_cpumask); } void shadow_blow_tables_per_domain(struct domain *d) @@ -1535,7 +1535,7 @@ mfn_t shadow_alloc(struct domain *d, sp = page_list_remove_head(&d->arch.paging.shadow.freelist); /* Before we overwrite the old contents of this page, * we need to be sure that no TLB holds a pointer to it. */ - mask = d->domain_dirty_cpumask; + cpumask_copy(&mask, d->domain_dirty_cpumask); tlbflush_filter(mask, sp->tlbflush_timestamp); if ( unlikely(!cpus_empty(mask)) ) { @@ -2767,7 +2767,7 @@ void sh_remove_shadows(struct vcpu *v, m /* Need to flush TLBs now, so that linear maps are safe next time we * take a fault. 
*/ - flush_tlb_mask(&v->domain->domain_dirty_cpumask); + flush_tlb_mask(v->domain->domain_dirty_cpumask); if ( do_locking ) shadow_unlock(v->domain); } @@ -3474,7 +3474,7 @@ static void sh_unshadow_for_p2m_change(s { sh_remove_all_shadows_and_parents(v, mfn); if ( sh_remove_all_mappings(v, mfn) ) - flush_tlb_mask(&d->domain_dirty_cpumask); + flush_tlb_mask(d->domain_dirty_cpumask); } } @@ -3509,7 +3509,8 @@ static void sh_unshadow_for_p2m_change(s /* This GFN->MFN mapping has gone away */ sh_remove_all_shadows_and_parents(v, omfn); if ( sh_remove_all_mappings(v, omfn) ) - cpus_or(flushmask, flushmask, d->domain_dirty_cpumask); + cpumask_or(&flushmask, &flushmask, + d->domain_dirty_cpumask); } omfn = _mfn(mfn_x(omfn) + 1); } @@ -3806,7 +3807,7 @@ int shadow_track_dirty_vram(struct domai } } if ( flush_tlb ) - flush_tlb_mask(&d->domain_dirty_cpumask); + flush_tlb_mask(d->domain_dirty_cpumask); goto out; out_sl1ma: --- a/xen/arch/x86/mm/shadow/multi.c +++ b/xen/arch/x86/mm/shadow/multi.c @@ -3248,7 +3248,7 @@ static int sh_page_fault(struct vcpu *v, */ perfc_incr(shadow_rm_write_flush_tlb); atomic_inc(&d->arch.paging.shadow.gtable_dirty_version); - flush_tlb_mask(&d->domain_dirty_cpumask); + flush_tlb_mask(d->domain_dirty_cpumask); } #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) @@ -4294,7 +4294,7 @@ sh_update_cr3(struct vcpu *v, int do_loc * (old) shadow linear maps in the writeable mapping heuristics. */ #if GUEST_PAGING_LEVELS == 2 if ( sh_remove_write_access(v, gmfn, 2, 0) != 0 ) - flush_tlb_mask(&v->domain->domain_dirty_cpumask); + flush_tlb_mask(d->domain_dirty_cpumask); sh_set_toplevel_shadow(v, 0, gmfn, SH_type_l2_shadow); #elif GUEST_PAGING_LEVELS == 3 /* PAE guests have four shadow_table entries, based on the @@ -4317,7 +4317,7 @@ sh_update_cr3(struct vcpu *v, int do_loc } } if ( flush ) - flush_tlb_mask(&v->domain->domain_dirty_cpumask); + flush_tlb_mask(d->domain_dirty_cpumask); /* Now install the new shadows. */ for ( i = 0; i < 4; i++ ) { @@ -4338,7 +4338,7 @@ sh_update_cr3(struct vcpu *v, int do_loc } #elif GUEST_PAGING_LEVELS == 4 if ( sh_remove_write_access(v, gmfn, 4, 0) != 0 ) - flush_tlb_mask(&v->domain->domain_dirty_cpumask); + flush_tlb_mask(d->domain_dirty_cpumask); sh_set_toplevel_shadow(v, 0, gmfn, SH_type_l4_shadow); #else #error This should never happen @@ -4755,7 +4755,7 @@ static void sh_pagetable_dying(struct vc } } if ( flush ) - flush_tlb_mask(&v->domain->domain_dirty_cpumask); + flush_tlb_mask(v->domain->domain_dirty_cpumask); /* Remember that we've seen the guest use this interface, so we * can rely on it using it in future, instead of guessing at @@ -4788,7 +4788,7 @@ static void sh_pagetable_dying(struct vc mfn_to_page(gmfn)->shadow_flags |= SHF_pagetable_dying; shadow_unhook_mappings(v, smfn, 1/* user pages only */); /* Now flush the TLB: we removed toplevel mappings. */ - flush_tlb_mask(&v->domain->domain_dirty_cpumask); + flush_tlb_mask(v->domain->domain_dirty_cpumask); } /* Remember that we've seen the guest use this interface, so we --- a/xen/arch/x86/traps.c +++ b/xen/arch/x86/traps.c @@ -2965,7 +2965,7 @@ static void nmi_mce_softirq(void) /* Set the tmp value unconditionally, so that * the check in the iret hypercall works. */ - st->vcpu->cpu_affinity_tmp = st->vcpu->cpu_affinity; + cpumask_copy(st->vcpu->cpu_affinity_tmp, st->vcpu->cpu_affinity); if ((cpu != st->processor) || (st->processor != st->vcpu->processor)) @@ -2996,11 +2996,11 @@ void async_exception_cleanup(struct vcpu return; /* Restore affinity. 
*/ - if ( !cpus_empty(curr->cpu_affinity_tmp) && - !cpus_equal(curr->cpu_affinity_tmp, curr->cpu_affinity) ) + if ( !cpumask_empty(curr->cpu_affinity_tmp) && + !cpumask_equal(curr->cpu_affinity_tmp, curr->cpu_affinity) ) { - vcpu_set_affinity(curr, &curr->cpu_affinity_tmp); - cpus_clear(curr->cpu_affinity_tmp); + vcpu_set_affinity(curr, curr->cpu_affinity_tmp); + cpumask_clear(curr->cpu_affinity_tmp); } if ( !(curr->async_exception_mask & (curr->async_exception_mask - 1)) ) @@ -3048,7 +3048,7 @@ void async_exception_cleanup(struct vcpu int cpu = smp_processor_id(); cpumask_t affinity; - curr->cpu_affinity_tmp = curr->cpu_affinity; + cpumask_copy(curr->cpu_affinity_tmp, curr->cpu_affinity); cpus_clear(affinity); cpu_set(cpu, affinity); printk(XENLOG_DEBUG "MCE: CPU%d set affinity, old %d\n", --- a/xen/common/domain.c +++ b/xen/common/domain.c @@ -151,6 +151,11 @@ struct vcpu *alloc_vcpu( tasklet_init(&v->continue_hypercall_tasklet, NULL, 0); + if ( !zalloc_cpumask_var(&v->cpu_affinity) || + !zalloc_cpumask_var(&v->cpu_affinity_tmp) || + !zalloc_cpumask_var(&v->vcpu_dirty_cpumask) ) + goto fail_free; + if ( is_idle_domain(d) ) { v->runstate.state = RUNSTATE_running; @@ -167,16 +172,17 @@ struct vcpu *alloc_vcpu( } if ( sched_init_vcpu(v, cpu_id) != 0 ) - { - destroy_waitqueue_vcpu(v); - free_vcpu_struct(v); - return NULL; - } + goto fail_wq; if ( vcpu_initialise(v) != 0 ) { sched_destroy_vcpu(v); + fail_wq: destroy_waitqueue_vcpu(v); + fail_free: + free_cpumask_var(v->cpu_affinity); + free_cpumask_var(v->cpu_affinity_tmp); + free_cpumask_var(v->vcpu_dirty_cpumask); free_vcpu_struct(v); return NULL; } @@ -246,6 +252,9 @@ struct domain *domain_create( spin_lock_init(&d->shutdown_lock); d->shutdown_code = -1; + if ( !zalloc_cpumask_var(&d->domain_dirty_cpumask) ) + goto fail; + if ( domcr_flags & DOMCRF_hvm ) d->is_hvm = 1; @@ -346,6 +355,7 @@ struct domain *domain_create( xsm_free_security_domain(d); xfree(d->pirq_mask); xfree(d->pirq_to_evtchn); + free_cpumask_var(d->domain_dirty_cpumask); free_domain_struct(d); return NULL; } @@ -361,7 +371,7 @@ void domain_update_node_affinity(struct spin_lock(&d->node_affinity_lock); for_each_vcpu ( d, v ) - cpus_or(cpumask, cpumask, v->cpu_affinity); + cpumask_or(&cpumask, &cpumask, v->cpu_affinity); for_each_online_node ( node ) if ( cpus_intersects(node_to_cpumask(node), cpumask) ) @@ -658,7 +668,12 @@ static void complete_domain_destroy(stru for ( i = d->max_vcpus - 1; i >= 0; i-- ) if ( (v = d->vcpu[i]) != NULL ) + { + free_cpumask_var(v->cpu_affinity); + free_cpumask_var(v->cpu_affinity_tmp); + free_cpumask_var(v->vcpu_dirty_cpumask); free_vcpu_struct(v); + } if ( d->target != NULL ) put_domain(d->target); @@ -669,6 +684,7 @@ static void complete_domain_destroy(stru xfree(d->pirq_to_evtchn); xsm_free_security_domain(d); + free_cpumask_var(d->domain_dirty_cpumask); free_domain_struct(d); send_guest_global_virq(dom0, VIRQ_DOM_EXC); @@ -789,7 +805,7 @@ void vcpu_reset(struct vcpu *v) v->async_exception_mask = 0; memset(v->async_exception_state, 0, sizeof(v->async_exception_state)); #endif - cpus_clear(v->cpu_affinity_tmp); + cpumask_clear(v->cpu_affinity_tmp); clear_bit(_VPF_blocked, &v->pause_flags); domain_unlock(v->domain); --- a/xen/common/domctl.c +++ b/xen/common/domctl.c @@ -589,7 +589,7 @@ long do_domctl(XEN_GUEST_HANDLE(xen_domc else { ret = cpumask_to_xenctl_cpumap( - &op->u.vcpuaffinity.cpumap, &v->cpu_affinity); + &op->u.vcpuaffinity.cpumap, v->cpu_affinity); } vcpuaffinity_out: --- a/xen/common/grant_table.c +++ b/xen/common/grant_table.c 
@@ -1013,7 +1013,7 @@ gnttab_unmap_grant_ref(
 goto fault;
 }
- flush_tlb_mask(&current->domain->domain_dirty_cpumask);
+ flush_tlb_mask(current->domain->domain_dirty_cpumask);
 for ( i = 0; i < partial_done; i++ )
 __gnttab_unmap_common_complete(&(common[i]));
@@ -1028,7 +1028,7 @@ gnttab_unmap_grant_ref(
 return 0;
 fault:
- flush_tlb_mask(&current->domain->domain_dirty_cpumask);
+ flush_tlb_mask(current->domain->domain_dirty_cpumask);
 for ( i = 0; i < partial_done; i++ )
 __gnttab_unmap_common_complete(&(common[i]));
@@ -1075,7 +1075,7 @@ gnttab_unmap_and_replace(
 goto fault;
 }
- flush_tlb_mask(&current->domain->domain_dirty_cpumask);
+ flush_tlb_mask(current->domain->domain_dirty_cpumask);
 for ( i = 0; i < partial_done; i++ )
 __gnttab_unmap_common_complete(&(common[i]));
@@ -1090,7 +1090,7 @@ gnttab_unmap_and_replace(
 return 0;
 fault:
- flush_tlb_mask(&current->domain->domain_dirty_cpumask);
+ flush_tlb_mask(current->domain->domain_dirty_cpumask);
 for ( i = 0; i < partial_done; i++ )
 __gnttab_unmap_common_complete(&(common[i]));
@@ -1496,7 +1496,7 @@ gnttab_transfer(
 #ifndef __ia64__ /* IA64 implicitly replaces the old page in steal_page(). */
 guest_physmap_remove_page(d, gop.mfn, mfn, 0);
 #endif
- flush_tlb_mask(&d->domain_dirty_cpumask);
+ flush_tlb_mask(d->domain_dirty_cpumask);
 /* Find the target domain. */
 if ( unlikely((e = rcu_lock_domain_by_id(gop.domid)) == NULL) )
--- a/xen/common/keyhandler.c
+++ b/xen/common/keyhandler.c
@@ -243,7 +243,7 @@ static void dump_domains(unsigned char k
 {
 unsigned int i;
 printk("General information for domain %u:\n", d->domain_id);
- cpuset_print(tmpstr, sizeof(tmpstr), d->domain_dirty_cpumask);
+ cpuset_print(tmpstr, sizeof(tmpstr), *d->domain_dirty_cpumask);
 printk(" refcnt=%d dying=%d nr_pages=%d xenheap_pages=%d "
 "dirty_cpus=%s max_pages=%u\n",
 atomic_read(&d->refcnt), d->is_dying,
@@ -277,9 +277,9 @@ static void dump_domains(unsigned char k
 v->pause_flags, v->poll_evtchn,
 vcpu_info(v, evtchn_upcall_pending),
 vcpu_info(v, evtchn_upcall_mask));
- cpuset_print(tmpstr, sizeof(tmpstr), v->vcpu_dirty_cpumask);
+ cpuset_print(tmpstr, sizeof(tmpstr), *v->vcpu_dirty_cpumask);
 printk("dirty_cpus=%s ", tmpstr);
- cpuset_print(tmpstr, sizeof(tmpstr), v->cpu_affinity);
+ cpuset_print(tmpstr, sizeof(tmpstr), *v->cpu_affinity);
 printk("cpu_affinity=%s\n", tmpstr);
 arch_dump_vcpu_info(v);
 periodic_timer_print(tmpstr, sizeof(tmpstr), v->periodic_period);
--- a/xen/common/sched_credit.c
+++ b/xen/common/sched_credit.c
@@ -292,7 +292,7 @@ __runq_tickle(unsigned int cpu, struct c
 {
 cpumask_t idle_mask;
- cpus_and(idle_mask, prv->idlers, new->vcpu->cpu_affinity);
+ cpumask_and(&idle_mask, &prv->idlers, new->vcpu->cpu_affinity);
 if ( !cpus_empty(idle_mask) )
 {
 CSCHED_STAT_CRANK(tickle_idlers_some);
@@ -305,7 +305,7 @@ __runq_tickle(unsigned int cpu, struct c
 else
 cpus_or(mask, mask, idle_mask);
 }
- cpus_and(mask, mask, new->vcpu->cpu_affinity);
+ cpumask_and(&mask, &mask, new->vcpu->cpu_affinity);
 }
 }
@@ -455,7 +455,7 @@ __csched_vcpu_is_migrateable(struct vcpu
 */
 return !vc->is_running &&
 !__csched_vcpu_is_cache_hot(vc) &&
- cpu_isset(dest_cpu, vc->cpu_affinity);
+ cpumask_test_cpu(dest_cpu, vc->cpu_affinity);
 }
 static int
@@ -472,7 +472,7 @@ _csched_cpu_pick(const struct scheduler
 * preference to its current processor if it's in there.
 */
 online = CSCHED_CPUONLINE(vc->domain->cpupool);
- cpus_and(cpus, *online, vc->cpu_affinity);
+ cpumask_and(&cpus, online, vc->cpu_affinity);
 cpu = cpu_isset(vc->processor, cpus) ?
vc->processor : cycle_cpu(vc->processor, cpus); --- a/xen/common/sched_sedf.c +++ b/xen/common/sched_sedf.c @@ -448,7 +448,7 @@ static int sedf_pick_cpu(const struct sc cpumask_t *online; online = SEDF_CPUONLINE(v->domain->cpupool); - cpus_and(online_affinity, v->cpu_affinity, *online); + cpumask_and(&online_affinity, v->cpu_affinity, online); return first_cpu(online_affinity); } --- a/xen/common/schedule.c +++ b/xen/common/schedule.c @@ -196,9 +196,9 @@ int sched_init_vcpu(struct vcpu *v, unsi */ v->processor = processor; if ( is_idle_domain(d) || d->is_pinned ) - v->cpu_affinity = cpumask_of_cpu(processor); + cpumask_copy(v->cpu_affinity, cpumask_of(processor)); else - cpus_setall(v->cpu_affinity); + cpumask_setall(v->cpu_affinity); /* Initialise the per-vcpu timers. */ init_timer(&v->periodic_timer, vcpu_periodic_timer_fn, @@ -273,7 +273,7 @@ int sched_move_domain(struct domain *d, SCHED_OP(VCPU2OP(v), remove_vcpu, v); SCHED_OP(VCPU2OP(v), free_vdata, v->sched_priv); - cpus_setall(v->cpu_affinity); + cpumask_setall(v->cpu_affinity); v->processor = new_p; v->sched_priv = vcpu_priv[v->vcpu_id]; evtchn_move_pirqs(v); @@ -435,7 +435,7 @@ static void vcpu_migrate(struct vcpu *v) */ if ( pick_called && (new_lock == per_cpu(schedule_data, new_cpu).schedule_lock) && - cpu_isset(new_cpu, v->cpu_affinity) && + cpumask_test_cpu(new_cpu, v->cpu_affinity) && cpu_isset(new_cpu, v->domain->cpupool->cpu_valid) ) break; @@ -550,13 +550,13 @@ int cpu_disable_scheduler(unsigned int c { vcpu_schedule_lock_irq(v); - cpus_and(online_affinity, v->cpu_affinity, c->cpu_valid); + cpumask_and(&online_affinity, v->cpu_affinity, &c->cpu_valid); if ( cpus_empty(online_affinity) && - cpu_isset(cpu, v->cpu_affinity) ) + cpumask_test_cpu(cpu, v->cpu_affinity) ) { printk("Breaking vcpu affinity for domain %d vcpu %d\n", v->domain->domain_id, v->vcpu_id); - cpus_setall(v->cpu_affinity); + cpumask_setall(v->cpu_affinity); affinity_broken = 1; } @@ -602,10 +602,10 @@ int vcpu_set_affinity(struct vcpu *v, cp vcpu_schedule_lock_irq(v); - old_affinity = v->cpu_affinity; - v->cpu_affinity = *affinity; - *affinity = old_affinity; - if ( !cpu_isset(v->processor, v->cpu_affinity) ) + cpumask_copy(&old_affinity, v->cpu_affinity); + cpumask_copy(v->cpu_affinity, affinity); + cpumask_copy(affinity, &old_affinity); + if ( !cpumask_test_cpu(v->processor, v->cpu_affinity) ) set_bit(_VPF_migrating, &v->pause_flags); vcpu_schedule_unlock_irq(v); --- a/xen/include/asm-x86/hvm/vmx/vmcs.h +++ b/xen/include/asm-x86/hvm/vmx/vmcs.h @@ -68,7 +68,7 @@ struct vmx_domain { }; u64 eptp; } ept_control; - cpumask_t ept_synced; + cpumask_var_t ept_synced; }; #define ept_get_wl(d) \ --- a/xen/include/xen/cpumask.h +++ b/xen/include/xen/cpumask.h @@ -81,24 +81,26 @@ typedef struct cpumask{ DECLARE_BITMAP(bits, NR_CPUS); } cpumask_t; -#define cpu_set(cpu, dst) __cpu_set((cpu), &(dst)) -static inline void __cpu_set(int cpu, volatile cpumask_t *dstp) +#define cpu_set(cpu, dst) cpumask_set_cpu(cpu, &(dst)) +static inline void cpumask_set_cpu(int cpu, volatile cpumask_t *dstp) { set_bit(cpu, dstp->bits); } -#define cpu_clear(cpu, dst) __cpu_clear((cpu), &(dst)) -static inline void __cpu_clear(int cpu, volatile cpumask_t *dstp) +#define cpu_clear(cpu, dst) cpumask_clear_cpu(cpu, &(dst)) +static inline void cpumask_clear_cpu(int cpu, volatile cpumask_t *dstp) { clear_bit(cpu, dstp->bits); } +#define cpumask_setall(dst) __cpus_setall(dst, NR_CPUS) #define cpus_setall(dst) __cpus_setall(&(dst), NR_CPUS) static inline void __cpus_setall(cpumask_t *dstp, int 
nbits) { bitmap_fill(dstp->bits, nbits); } +#define cpumask_clear(dst) __cpus_clear(dst, NR_CPUS) #define cpus_clear(dst) __cpus_clear(&(dst), NR_CPUS) static inline void __cpus_clear(cpumask_t *dstp, int nbits) { @@ -109,18 +111,21 @@ static inline void __cpus_clear(cpumask_ #define cpumask_test_cpu(cpu, cpumask) test_bit(cpu, (cpumask)->bits) #define cpu_isset(cpu, cpumask) test_bit((cpu), (cpumask).bits) -#define cpu_test_and_set(cpu, cpumask) __cpu_test_and_set((cpu), &(cpumask)) -static inline int __cpu_test_and_set(int cpu, cpumask_t *addr) +#define cpu_test_and_set(cpu, cpumask) \ + cpumask_test_and_set_cpu(cpu, &(cpumask)) +static inline int cpumask_test_and_set_cpu(int cpu, cpumask_t *addr) { return test_and_set_bit(cpu, addr->bits); } -#define cpu_test_and_clear(cpu, cpumask) __cpu_test_and_clear((cpu), &(cpumask)) -static inline int __cpu_test_and_clear(int cpu, cpumask_t *addr) +#define cpu_test_and_clear(cpu, cpumask) \ + cpumask_test_and_clear_cpu(cpu, &(cpumask)) +static inline int cpumask_test_and_clear_cpu(int cpu, cpumask_t *addr) { return test_and_clear_bit(cpu, addr->bits); } +#define cpumask_and(dst, src1, src2) __cpus_and(dst, src1, src2, NR_CPUS) #define cpus_and(dst, src1, src2) __cpus_and(&(dst), &(src1), &(src2), NR_CPUS) static inline void __cpus_and(cpumask_t *dstp, const cpumask_t *src1p, const cpumask_t *src2p, int nbits) @@ -128,6 +133,7 @@ static inline void __cpus_and(cpumask_t bitmap_and(dstp->bits, src1p->bits, src2p->bits, nbits); } +#define cpumask_or(dst, src1, src2) __cpus_or(dst, src1, src2, NR_CPUS) #define cpus_or(dst, src1, src2) __cpus_or(&(dst), &(src1), &(src2), NR_CPUS) static inline void __cpus_or(cpumask_t *dstp, const cpumask_t *src1p, const cpumask_t *src2p, int nbits) @@ -135,6 +141,7 @@ static inline void __cpus_or(cpumask_t * bitmap_or(dstp->bits, src1p->bits, src2p->bits, nbits); } +#define cpumask_xor(dst, src1, src2) __cpus_xor(dst, src1, src2, NR_CPUS) #define cpus_xor(dst, src1, src2) __cpus_xor(&(dst), &(src1), &(src2), NR_CPUS) static inline void __cpus_xor(cpumask_t *dstp, const cpumask_t *src1p, const cpumask_t *src2p, int nbits) @@ -142,6 +149,7 @@ static inline void __cpus_xor(cpumask_t bitmap_xor(dstp->bits, src1p->bits, src2p->bits, nbits); } +#define cpumask_andnot(dst, src1, src2) __cpus_andnot(dst, src1, src2, NR_CPUS) #define cpus_andnot(dst, src1, src2) \ __cpus_andnot(&(dst), &(src1), &(src2), NR_CPUS) static inline void __cpus_andnot(cpumask_t *dstp, const cpumask_t *src1p, @@ -150,6 +158,7 @@ static inline void __cpus_andnot(cpumask bitmap_andnot(dstp->bits, src1p->bits, src2p->bits, nbits); } +#define cpumask_complement(dst, src) __cpus_complement(dst, src, NR_CPUS) #define cpus_complement(dst, src) __cpus_complement(&(dst), &(src), NR_CPUS) static inline void __cpus_complement(cpumask_t *dstp, const cpumask_t *srcp, int nbits) @@ -186,6 +195,7 @@ static inline int __cpus_empty(const cpu return bitmap_empty(srcp->bits, nbits); } +#define cpumask_full(cpumask) __cpus_full(cpumask, NR_CPUS) #define cpus_full(cpumask) __cpus_full(&(cpumask), NR_CPUS) static inline int __cpus_full(const cpumask_t *srcp, int nbits) { @@ -199,8 +209,8 @@ static inline int __cpus_weight(const cp return bitmap_weight(srcp->bits, nbits); } -#define cpus_copy(dest, src) __cpus_copy(&(dest), &(src)) -static inline void __cpus_copy(cpumask_t *dstp, const cpumask_t *srcp) +#define cpus_copy(dest, src) cpumask_copy(&(dest), &(src)) +static inline void cpumask_copy(cpumask_t *dstp, const cpumask_t *srcp) { bitmap_copy(dstp->bits, srcp->bits, 
NR_CPUS);
 }
@@ -322,6 +332,57 @@ static inline int __cpulist_scnprintf(ch
 return bitmap_scnlistprintf(buf, len, srcp->bits, nbits);
 }
+/*
+ * cpumask_var_t: struct cpumask for stack usage.
+ *
+ * Oh, the wicked games we play! In order to make kernel coding a
+ * little more difficult, we typedef cpumask_var_t to an array or a
+ * pointer: doing &mask on an array is a noop, so it still works.
+ *
+ * ie.
+ * cpumask_var_t tmpmask;
+ * if (!alloc_cpumask_var(&tmpmask))
+ * return -ENOMEM;
+ *
+ * ... use 'tmpmask' like a normal struct cpumask * ...
+ *
+ * free_cpumask_var(tmpmask);
+ */
+#if NR_CPUS > 2 * BITS_PER_LONG
+#include <xen/xmalloc.h>
+
+typedef cpumask_t *cpumask_var_t;
+
+static inline bool_t alloc_cpumask_var(cpumask_var_t *mask)
+{
+ return (*mask = xmalloc(cpumask_t)) != NULL;
+}
+
+static inline void free_cpumask_var(cpumask_var_t mask)
+{
+ xfree(mask);
+}
+#else
+typedef cpumask_t cpumask_var_t[1];
+
+static inline bool_t alloc_cpumask_var(cpumask_var_t *mask)
+{
+ return 1;
+}
+
+static inline void free_cpumask_var(cpumask_var_t mask)
+{
+}
+#endif
+
+static inline bool_t zalloc_cpumask_var(cpumask_var_t *mask)
+{
+ if (!alloc_cpumask_var(mask))
+ return 0;
+ cpumask_clear(*mask);
+ return 1;
+}
+
 #if NR_CPUS > 1
 #define for_each_cpu_mask(cpu, mask) \
 for ((cpu) = first_cpu(mask); \
--- a/xen/include/xen/sched.h
+++ b/xen/include/xen/sched.h
@@ -161,12 +161,12 @@ struct vcpu
 spinlock_t virq_lock;
 /* Bitmask of CPUs on which this VCPU may run. */
- cpumask_t cpu_affinity;
+ cpumask_var_t cpu_affinity;
 /* Used to change affinity temporarily. */
- cpumask_t cpu_affinity_tmp;
+ cpumask_var_t cpu_affinity_tmp;
 /* Bitmask of CPUs which are holding onto this VCPU's state. */
- cpumask_t vcpu_dirty_cpumask;
+ cpumask_var_t vcpu_dirty_cpumask;
 /* Tasklet for continue_hypercall_on_cpu(). */
 struct tasklet continue_hypercall_tasklet;
@@ -289,7 +289,7 @@ struct domain
 struct vcpu **vcpu;
 /* Bitmask of CPUs which are holding onto this domain's state. */
- cpumask_t domain_dirty_cpumask;
+ cpumask_var_t domain_dirty_cpumask;
 struct arch_domain arch;
@@ -641,7 +641,7 @@ void watchdog_domain_destroy(struct doma
 #define is_hvm_domain(d) ((d)->is_hvm)
 #define is_hvm_vcpu(v) (is_hvm_domain(v->domain))
 #define is_pinned_vcpu(v) ((v)->domain->is_pinned || \
- cpus_weight((v)->cpu_affinity) == 1)
+ cpumask_weight((v)->cpu_affinity) == 1)
 #define need_iommu(d) ((d)->need_iommu)
 void set_vcpu_migration_delay(unsigned int delay);
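
Usage sketch (illustrative only, not part of the patch): a subsystem that embeds a CPU mask in one of its own structures would follow the same pattern as the struct domain / struct vcpu changes above -- zalloc_cpumask_var() at initialisation, the pointer-based cpumask_*() accessors at run time, and free_cpumask_var() on teardown. The struct foo and foo_*() names below are made up for the example; only the cpumask interface itself comes from this patch.

#include <xen/cpumask.h>
#include <xen/errno.h>

/* Hypothetical object carrying a dynamically sized CPU mask. */
struct foo {
    cpumask_var_t dirty_mask;  /* cpumask_t * or cpumask_t[1], depending on NR_CPUS */
};

static int foo_init(struct foo *f)
{
    /* Allocate and zero the mask; only the NR_CPUS > 2*BITS_PER_LONG case can fail. */
    if ( !zalloc_cpumask_var(&f->dirty_mask) )
        return -ENOMEM;
    return 0;
}

static void foo_mark_cpu(struct foo *f, unsigned int cpu)
{
    /* The cpumask_*() accessors take pointers, so no &(...) is needed. */
    cpumask_set_cpu(cpu, f->dirty_mask);
}

static void foo_destroy(struct foo *f)
{
    free_cpumask_var(f->dirty_mask);
}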