[Xen-devel] [PATCH, RFC] x86: make the GDT per-CPU

To: <xen-devel@xxxxxxxxxxxxxxxxxxx>
Subject: [Xen-devel] [PATCH, RFC] x86: make the GDT per-CPU
From: "Jan Beulich" <jbeulich@xxxxxxxxxx>
Date: Wed, 10 Sep 2008 15:35:40 +0100
Delivery-date: Wed, 10 Sep 2008 07:36:01 -0700
List-help: <mailto:xen-devel-request@lists.xensource.com?subject=help>
List-id: Xen developer discussion <xen-devel.lists.xensource.com>
List-post: <mailto:xen-devel@lists.xensource.com>
List-subscribe: <http://lists.xensource.com/mailman/listinfo/xen-devel>, <mailto:xen-devel-request@lists.xensource.com?subject=subscribe>
List-unsubscribe: <http://lists.xensource.com/mailman/listinfo/xen-devel>, <mailto:xen-devel-request@lists.xensource.com?subject=unsubscribe>
Sender: xen-devel-bounces@xxxxxxxxxxxxxxxxxxx
The major issue with supporting a significantly larger number of physical
CPUs appears to be the use of per-CPU GDT entries: at present, x86-64 can
support only up to 126 CPUs (with code changes to also use the top-most
GDT page, that would become 254). Instead of taking incremental steps
here, this patch converts the GDT itself to be per-CPU, making that
limitation go away entirely.
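
For reference, the 126/254 figures fall out of the pre-patch layout, in
which each CPU consumes four reserved 8-byte GDT entries (a 16-byte TSS
plus a 16-byte LDT descriptor via __TSS(n)/__LDT(n)) and the first eight
reserved entries hold the fixed Xen/guest selectors. A quick sanity check
of that arithmetic (a standalone sketch based on my reading of those
constants, not code from the patch):

#include <stdio.h>

int main(void)
{
    const unsigned entries_per_page = 4096 / 8; /* 512 8-byte GDT entries */
    const unsigned fixed_entries    = 8;        /* fixed Xen/guest selectors */
    const unsigned per_cpu_entries  = 4;        /* 16-byte TSS + LDT descriptors */

    printf("one reserved page:  %u CPUs\n",
           (1 * entries_per_page - fixed_entries) / per_cpu_entries);
    printf("two reserved pages: %u CPUs\n",
           (2 * entries_per_page - fixed_entries) / per_cpu_entries);
    return 0;
}

which prints 126 and 254 respectively.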

There's one particular part of it that I'm not very happy with, but for
which I have had no better idea so far: in the general case, it is now
necessary to reload the GDT twice during a context switch. Hence I'd
appreciate ideas on how to avoid this and stay with a single reload.
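
To spell out why a single reload doesn't obviously work: the GDTR must
point at a valid mapping at all times, but the per-vcpu linear alias
(GDT_VIRT_START(n)) of the new GDT only becomes valid once write_ptbase()
has switched to the new page tables, while the outgoing vcpu's alias stops
being usable at that same point. The patch therefore reloads first through
the always-valid Xen mapping of the per-CPU GDT, switches CR3, and reloads
again through the per-vcpu alias. A minimal sketch of that ordering
(hypothetical helper names; the real code is in the xen/arch/x86/domain.c
hunk below):

struct desc_ptr {
    unsigned short limit;
    unsigned long  base;
} __attribute__((__packed__));

static inline void lgdt_ptr(unsigned long base, unsigned short limit)
{
    struct desc_ptr d = { limit, base };
    asm volatile ( "lgdt %0" : : "m" (d) );
}

/* xen_alias: this CPU's GDT through Xen's direct mapping (valid in every
 * address space); vcpu_alias: GDT_VIRT_START(n), valid only under the new
 * page tables. */
void gdt_reload_pair(unsigned long xen_alias, unsigned long vcpu_alias,
                     unsigned short limit)
{
    lgdt_ptr(xen_alias, limit);   /* 1st reload: safe across the CR3 switch */
    /* ... write_ptbase(n) switches page tables here ... */
    lgdt_ptr(vcpu_alias, limit);  /* 2nd reload: back to the per-vcpu alias */
}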

The patch has several debug items in it (which are marked as such), so it
is in no way intended to go in as-is.
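
One detail worth pointing out for reviewers: the new PER_CPU_GDT_ENTRY
descriptor (the .quad 0x0000910000000000 lines below) is a present ring-0
data segment whose limit field gets overwritten with the CPU number during
boot, so lsl on that selector recovers the CPU id without touching memory.
This is what the double fault handlers and the check_cpu() debug helper
rely on; a minimal sketch (the wrapper name is mine):

static inline unsigned int cpu_from_gdt(void)
{
    unsigned int cpu;
    /* PER_CPU_GDT_ENTRY as defined in the asm-x86/desc.h hunk below. */
    asm ( "lsll %1, %0" : "=r" (cpu) : "rm" (PER_CPU_GDT_ENTRY << 3) );
    return cpu;
}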

Jan

Index: 2008-09-01/xen/arch/x86/boot/wakeup.S
===================================================================
--- 2008-09-01.orig/xen/arch/x86/boot/wakeup.S  2008-09-10 13:43:04.000000000 +0200
+++ 2008-09-01/xen/arch/x86/boot/wakeup.S       2008-09-09 10:44:30.000000000 +0200
@@ -168,7 +168,7 @@ wakeup_32:
         .word   0,0,0
 lgdt_descr:
         .word   LAST_RESERVED_GDT_BYTE
-        .quad   gdt_table - FIRST_RESERVED_GDT_BYTE
+        .quad   boot_cpu_gdt_table - FIRST_RESERVED_GDT_BYTE
         
 wakeup_64:
         lgdt    lgdt_descr(%rip)
Index: 2008-09-01/xen/arch/x86/boot/x86_32.S
===================================================================
--- 2008-09-01.orig/xen/arch/x86/boot/x86_32.S  2008-09-10 13:43:04.000000000 +0200
+++ 2008-09-01/xen/arch/x86/boot/x86_32.S       2008-09-09 14:45:58.000000000 +0200
@@ -78,7 +78,7 @@ idt_descr:
         .word   0
 gdt_descr:
         .word   LAST_RESERVED_GDT_BYTE
-        .long   gdt_table - FIRST_RESERVED_GDT_BYTE
+        .long   boot_cpu_gdt_table - FIRST_RESERVED_GDT_BYTE
 
 
         .align 32
@@ -94,7 +94,7 @@ ENTRY(idle_pg_table)
 #define GUEST_DESC(d)                                                   \
         .long ((MACH2PHYS_VIRT_END - 1) >> 12) & 0xffff,                \
               ((MACH2PHYS_VIRT_END - 1) >> 12) & (0xf << 16) | (d)
-ENTRY(gdt_table)
+ENTRY(boot_cpu_gdt_table)
         .quad 0x0000000000000000     /* unused */
         .quad 0x00cf9a000000ffff     /* 0xe008 ring 0 4.00GB code at 0x0 */
         .quad 0x00cf92000000ffff     /* 0xe010 ring 0 4.00GB data at 0x0 */
@@ -102,4 +102,6 @@ ENTRY(gdt_table)
         GUEST_DESC(0x00c0b200)       /* 0xe021 ring 1 3.xxGB data at 0x0 */
         GUEST_DESC(0x00c0fa00)       /* 0xe02b ring 3 3.xxGB code at 0x0 */
         GUEST_DESC(0x00c0f200)       /* 0xe033 ring 3 3.xxGB data at 0x0 */
+        .fill (PER_CPU_GDT_ENTRY - FLAT_RING3_DS / 8 - 1), 8, 0
+        .quad 0x0000910000000000     /* per-CPU entry (limit == cpu) */
         .align PAGE_SIZE,0
Index: 2008-09-01/xen/arch/x86/boot/x86_64.S
===================================================================
--- 2008-09-01.orig/xen/arch/x86/boot/x86_64.S  2008-09-10 13:43:04.000000000 +0200
+++ 2008-09-01/xen/arch/x86/boot/x86_64.S       2008-09-09 14:45:08.000000000 +0200
@@ -85,7 +85,7 @@ multiboot_ptr:
         .word   0
 gdt_descr:
         .word   LAST_RESERVED_GDT_BYTE
-        .quad   gdt_table - FIRST_RESERVED_GDT_BYTE
+        .quad   boot_cpu_gdt_table - FIRST_RESERVED_GDT_BYTE
 
         .word   0,0,0
 idt_descr:
@@ -96,7 +96,7 @@ ENTRY(stack_start)
         .quad   cpu0_stack
 
         .align PAGE_SIZE, 0
-ENTRY(gdt_table)
+ENTRY(boot_cpu_gdt_table)
         .quad 0x0000000000000000     /* unused */
         .quad 0x00af9a000000ffff     /* 0xe008 ring 0 code, 64-bit mode   */
         .quad 0x00cf92000000ffff     /* 0xe010 ring 0 data                */
@@ -105,11 +105,13 @@ ENTRY(gdt_table)
         .quad 0x00cff2000000ffff     /* 0xe02b ring 3 data                */
         .quad 0x00affa000000ffff     /* 0xe033 ring 3 code, 64-bit mode   */
         .quad 0x00cf9a000000ffff     /* 0xe038 ring 0 code, compatibility */
+        .fill (PER_CPU_GDT_ENTRY - __HYPERVISOR_CS32 / 8 - 1), 8, 0
+        .quad 0x0000910000000000     /* per-CPU entry (limit == cpu)      */
 
         .align PAGE_SIZE, 0
 /* NB. Even rings != 0 get access to the full 4Gb, as only the            */
 /*     (compatibility) machine->physical mapping table lives there.       */
-ENTRY(compat_gdt_table)
+ENTRY(boot_cpu_compat_gdt_table)
         .quad 0x0000000000000000     /* unused */
         .quad 0x00af9a000000ffff     /* 0xe008 ring 0 code, 64-bit mode   */
         .quad 0x00cf92000000ffff     /* 0xe010 ring 0 data                */
@@ -118,4 +120,6 @@ ENTRY(compat_gdt_table)
         .quad 0x00cffa000000ffff     /* 0xe02b ring 3 code, compatibility */
         .quad 0x00cff2000000ffff     /* 0xe033 ring 3 data                */
         .quad 0x00cf9a000000ffff     /* 0xe038 ring 0 code, compatibility */
+        .fill (PER_CPU_GDT_ENTRY - __HYPERVISOR_CS32 / 8 - 1), 8, 0
+        .quad 0x0000910000000000     /* per-CPU entry (limit == cpu)      */
         .align PAGE_SIZE, 0
Index: 2008-09-01/xen/arch/x86/cpu/common.c
===================================================================
--- 2008-09-01.orig/xen/arch/x86/cpu/common.c   2008-09-10 13:43:04.000000000 +0200
+++ 2008-09-01/xen/arch/x86/cpu/common.c        2008-09-10 16:09:18.000000000 +0200
@@ -575,6 +575,9 @@ void __cpuinit cpu_init(void)
        if (cpu_has_pat)
                wrmsrl(MSR_IA32_CR_PAT, host_pat);
 
+       /* Install correct page table. */
+       write_ptbase(current);
+
        *(unsigned short *)(&gdt_load[0]) = LAST_RESERVED_GDT_BYTE;
        *(unsigned long  *)(&gdt_load[2]) = GDT_VIRT_START(current);
        asm volatile ( "lgdt %0" : "=m" (gdt_load) );
@@ -605,9 +608,6 @@ void __cpuinit cpu_init(void)
 #define CD(register) asm volatile ( "mov %0,%%db" #register : : "r"(0UL) );
        CD(0); CD(1); CD(2); CD(3); /* no db4 and db5 */; CD(6); CD(7);
 #undef CD
-
-       /* Install correct page table. */
-       write_ptbase(current);
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
Index: 2008-09-01/xen/arch/x86/domain.c
===================================================================
--- 2008-09-01.orig/xen/arch/x86/domain.c       2008-09-10 13:43:04.000000000 +0200
+++ 2008-09-01/xen/arch/x86/domain.c    2008-09-10 15:10:05.000000000 +0200
@@ -205,7 +205,6 @@ static inline int may_switch_mode(struct
 
 int switch_native(struct domain *d)
 {
-    l1_pgentry_t gdt_l1e;
     unsigned int vcpuid;
 
     if ( d == NULL )
@@ -217,12 +216,8 @@ int switch_native(struct domain *d)
 
     d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 0;
 
-    /* switch gdt */
-    gdt_l1e = l1e_from_page(virt_to_page(gdt_table), PAGE_HYPERVISOR);
     for ( vcpuid = 0; vcpuid < MAX_VIRT_CPUS; vcpuid++ )
     {
-        d->arch.mm_perdomain_pt[((vcpuid << GDT_LDT_VCPU_SHIFT) +
-                                 FIRST_RESERVED_GDT_PAGE)] = gdt_l1e;
         if (d->vcpu[vcpuid])
             release_compat_l4(d->vcpu[vcpuid]);
     }
@@ -232,7 +227,6 @@ int switch_native(struct domain *d)
 
 int switch_compat(struct domain *d)
 {
-    l1_pgentry_t gdt_l1e;
     unsigned int vcpuid;
 
     if ( d == NULL )
@@ -244,15 +238,11 @@ int switch_compat(struct domain *d)
 
     d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 1;
 
-    /* switch gdt */
-    gdt_l1e = l1e_from_page(virt_to_page(compat_gdt_table), PAGE_HYPERVISOR);
     for ( vcpuid = 0; vcpuid < MAX_VIRT_CPUS; vcpuid++ )
     {
         if ( (d->vcpu[vcpuid] != NULL) &&
              (setup_compat_l4(d->vcpu[vcpuid]) != 0) )
             goto undo_and_fail;
-        d->arch.mm_perdomain_pt[((vcpuid << GDT_LDT_VCPU_SHIFT) +
-                                 FIRST_RESERVED_GDT_PAGE)] = gdt_l1e;
     }
 
     domain_set_alloc_bitsize(d);
@@ -261,13 +251,10 @@ int switch_compat(struct domain *d)
 
  undo_and_fail:
     d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 0;
-    gdt_l1e = l1e_from_page(virt_to_page(gdt_table), PAGE_HYPERVISOR);
     while ( vcpuid-- != 0 )
     {
         if ( d->vcpu[vcpuid] != NULL )
             release_compat_l4(d->vcpu[vcpuid]);
-        d->arch.mm_perdomain_pt[((vcpuid << GDT_LDT_VCPU_SHIFT) +
-                                 FIRST_RESERVED_GDT_PAGE)] = gdt_l1e;
     }
     return -ENOMEM;
 }
@@ -315,7 +302,13 @@ int vcpu_initialise(struct vcpu *v)
         if ( is_idle_domain(d) )
         {
             v->arch.schedule_tail = continue_idle_domain;
-            v->arch.cr3           = __pa(idle_pg_table);
+            if ( v->vcpu_id )
+                v->arch.cr3 = d->vcpu[0]->arch.cr3;
+            else if ( !*idle_vcpu )
+                v->arch.cr3 = __pa(idle_pg_table);
+            else if ( !(v->arch.cr3 = clone_idle_pagetable(v)) )
+                return -ENOMEM;
+else printk("new idle domain: CR3=%lx\n", v->arch.cr3);//temp
         }
 
         v->arch.guest_context.ctrlreg[4] =
@@ -342,8 +335,7 @@ int arch_domain_create(struct domain *d,
 #ifdef __x86_64__
     struct page_info *pg;
 #endif
-    l1_pgentry_t gdt_l1e;
-    int i, vcpuid, pdpt_order, paging_initialised = 0;
+    int i, pdpt_order, paging_initialised = 0;
     int rc = -ENOMEM;
 
     d->arch.hvm_domain.hap_enabled =
@@ -362,18 +354,6 @@ int arch_domain_create(struct domain *d,
         goto fail;
     memset(d->arch.mm_perdomain_pt, 0, PAGE_SIZE << pdpt_order);
 
-    /*
-     * Map Xen segments into every VCPU's GDT, irrespective of whether every
-     * VCPU will actually be used. This avoids an NMI race during context
-     * switch: if we take an interrupt after switching CR3 but before switching
-     * GDT, and the old VCPU# is invalid in the new domain, we would otherwise
-     * try to load CS from an invalid table.
-     */
-    gdt_l1e = l1e_from_page(virt_to_page(gdt_table), PAGE_HYPERVISOR);
-    for ( vcpuid = 0; vcpuid < MAX_VIRT_CPUS; vcpuid++ )
-        d->arch.mm_perdomain_pt[((vcpuid << GDT_LDT_VCPU_SHIFT) +
-                                 FIRST_RESERVED_GDT_PAGE)] = gdt_l1e;
-
 #if defined(__i386__)
 
     mapcache_domain_init(d);
@@ -1183,12 +1163,26 @@ static void paravirt_ctxt_switch_to(stru
     }
 }
 
+static void check_cpu(unsigned int cpu, int line) {//temp
+ unsigned int _cpu;
+ asm("lsll %1, %0" : "=r" (_cpu) : "rm" (PER_CPU_GDT_ENTRY << 3));
+ if(_cpu != cpu) {
+  struct desc_ptr gdt_desc;
+  asm("sgdt %0" : "=m" (gdt_desc));
+  printk("CPU#%u: wrong GDT (%lx->%u) at #%d\n", cpu, gdt_desc.base, _cpu, 
line);
+  show_page_walk(gdt_desc.base + FIRST_RESERVED_GDT_BYTE);
+ }
+}
+
 static void __context_switch(void)
 {
     struct cpu_user_regs *stack_regs = guest_cpu_user_regs();
-    unsigned int          cpu = smp_processor_id();
+    unsigned int          i, cpu = smp_processor_id();
     struct vcpu          *p = per_cpu(curr_vcpu, cpu);
     struct vcpu          *n = current;
+    struct desc_struct   *gdt;
+    struct page_info     *page;
+    struct desc_ptr       gdt_desc;
 
     ASSERT(p != n);
     ASSERT(cpus_empty(n->vcpu_dirty_cpumask));
@@ -1214,14 +1208,35 @@ static void __context_switch(void)
         cpu_set(cpu, n->domain->domain_dirty_cpumask);
     cpu_set(cpu, n->vcpu_dirty_cpumask);
 
+check_cpu(cpu, __LINE__);//temp
+    gdt = !is_pv_32on64_vcpu(n) ? per_cpu(gdt_table, cpu) :
+                                  per_cpu(compat_gdt_table, cpu);
+    page = virt_to_page(gdt);
+    for (i = 0; i < NR_RESERVED_GDT_PAGES; ++i)
+    {
+        n->domain->arch.mm_perdomain_pt
+            [(n->vcpu_id << GDT_LDT_VCPU_SHIFT) +
+             FIRST_RESERVED_GDT_PAGE + i]
+            = l1e_from_page(page + i, __PAGE_HYPERVISOR);
+    }
+
+check_cpu(cpu, __LINE__);//temp
+    if ( p->vcpu_id != n->vcpu_id )
+    {
+        gdt_desc.limit = LAST_RESERVED_GDT_BYTE;
+        gdt_desc.base  = (unsigned long)(gdt - FIRST_RESERVED_GDT_ENTRY);
+        asm volatile ( "lgdt %0" : : "m" (gdt_desc) );
+check_cpu(cpu, __LINE__);//temp
+    }
+
     write_ptbase(n);
 
+check_cpu(cpu, __LINE__);//temp
     if ( p->vcpu_id != n->vcpu_id )
     {
-        char gdt_load[10];
-        *(unsigned short *)(&gdt_load[0]) = LAST_RESERVED_GDT_BYTE;
-        *(unsigned long  *)(&gdt_load[2]) = GDT_VIRT_START(n);
-        asm volatile ( "lgdt %0" : "=m" (gdt_load) );
+        gdt_desc.base = GDT_VIRT_START(n);
+        asm volatile ( "lgdt %0" : : "m" (gdt_desc) );
+check_cpu(cpu, __LINE__);//temp
     }
 
     if ( p->domain != n->domain )
@@ -1257,6 +1272,7 @@ void context_switch(struct vcpu *prev, s
 
     if ( (per_cpu(curr_vcpu, cpu) == next) || is_idle_vcpu(next) )
     {
+check_cpu(cpu, __LINE__);//temp
         local_irq_enable();
     }
     else
@@ -1272,8 +1288,6 @@ void context_switch(struct vcpu *prev, s
             uint64_t efer = read_efer();
             if ( !(efer & EFER_SCE) )
                 write_efer(efer | EFER_SCE);
-            flush_tlb_one_local(GDT_VIRT_START(next) +
-                                FIRST_RESERVED_GDT_BYTE);
         }
 #endif
 
Index: 2008-09-01/xen/arch/x86/domain_build.c
===================================================================
--- 2008-09-01.orig/xen/arch/x86/domain_build.c 2008-09-10 13:43:04.000000000 +0200
+++ 2008-09-01/xen/arch/x86/domain_build.c      2008-09-09 11:01:01.000000000 +0200
@@ -313,24 +313,11 @@ int __init construct_dom0(
 #if defined(__x86_64__)
     if ( compat32 )
     {
-        l1_pgentry_t gdt_l1e;
-
         d->arch.is_32bit_pv = d->arch.has_32bit_shinfo = 1;
         v->vcpu_info = (void *)&d->shared_info->compat.vcpu_info[0];
 
         if ( nr_pages != (unsigned int)nr_pages )
             nr_pages = UINT_MAX;
-
-        /*
-         * Map compatibility Xen segments into every VCPU's GDT. See
-         * arch_domain_create() for further comments.
-         */
-        gdt_l1e = l1e_from_page(virt_to_page(compat_gdt_table),
-                                PAGE_HYPERVISOR);
-        for ( i = 0; i < MAX_VIRT_CPUS; i++ )
-            d->arch.mm_perdomain_pt[((i << GDT_LDT_VCPU_SHIFT) +
-                                     FIRST_RESERVED_GDT_PAGE)] = gdt_l1e;
-        flush_tlb_one_local(GDT_LDT_VIRT_START + FIRST_RESERVED_GDT_BYTE);
     }
 #endif
 
Index: 2008-09-01/xen/arch/x86/hvm/vmx/vmcs.c
===================================================================
--- 2008-09-01.orig/xen/arch/x86/hvm/vmx/vmcs.c 2008-09-10 13:43:04.000000000 +0200
+++ 2008-09-01/xen/arch/x86/hvm/vmx/vmcs.c      2008-09-09 14:09:36.000000000 +0200
@@ -446,7 +446,7 @@ static void vmx_set_host_env(struct vcpu
 
     __vmwrite(HOST_IDTR_BASE, (unsigned long)idt_tables[cpu]);
 
-    __vmwrite(HOST_TR_SELECTOR, __TSS(cpu) << 3);
+    __vmwrite(HOST_TR_SELECTOR, TSS_ENTRY << 3);
     __vmwrite(HOST_TR_BASE, (unsigned long)&init_tss[cpu]);
 
     __vmwrite(HOST_SYSENTER_ESP, get_stack_bottom());
Index: 2008-09-01/xen/arch/x86/setup.c
===================================================================
--- 2008-09-01.orig/xen/arch/x86/setup.c        2008-09-10 13:43:04.000000000 +0200
+++ 2008-09-01/xen/arch/x86/setup.c     2008-09-10 15:23:25.000000000 +0200
@@ -115,6 +115,12 @@ extern void early_cpu_init(void);
 extern void vesa_init(void);
 extern void vesa_mtrr_init(void);
 
+DEFINE_PER_CPU(struct desc_struct *, gdt_table) = boot_cpu_gdt_table;
+#ifdef CONFIG_COMPAT
+DEFINE_PER_CPU(struct desc_struct *, compat_gdt_table)
+    = boot_cpu_compat_gdt_table;
+#endif
+
 struct tss_struct init_tss[NR_CPUS];
 
 char __attribute__ ((__section__(".bss.stack_aligned"))) cpu0_stack[STACK_SIZE];
@@ -224,6 +230,7 @@ static void __init percpu_init_areas(voi
 static void __init init_idle_domain(void)
 {
     struct domain *idle_domain;
+    unsigned int i;
 
     /* Domain creation requires that scheduler structures are initialised. */
     scheduler_init();
@@ -236,6 +243,12 @@ static void __init init_idle_domain(void
     idle_vcpu[0] = this_cpu(curr_vcpu) = current;
 
     setup_idle_pagetable();
+
+    for (i = 0; i < NR_RESERVED_GDT_PAGES; ++i)
+        idle_domain->arch.mm_perdomain_pt[FIRST_RESERVED_GDT_PAGE + i] =
+            l1e_from_page(virt_to_page(boot_cpu_gdt_table) + i,
+                          __PAGE_HYPERVISOR);
+
 }
 
 static void __init srat_detect_node(int cpu)
@@ -443,7 +456,6 @@ void __init __start_xen(unsigned long mb
     parse_video_info();
 
     set_current((struct vcpu *)0xfffff000); /* debug sanity */
-    idle_vcpu[0] = current;
     set_processor_id(0); /* needed early, for smp_processor_id() */
     if ( cpu_has_efer )
         rdmsrl(MSR_EFER, this_cpu(efer));
Index: 2008-09-01/xen/arch/x86/smpboot.c
===================================================================
--- 2008-09-01.orig/xen/arch/x86/smpboot.c      2008-09-10 12:34:16.000000000 +0200
+++ 2008-09-01/xen/arch/x86/smpboot.c   2008-09-10 13:43:56.000000000 +0200
@@ -835,10 +835,15 @@ static int __devinit do_boot_cpu(int api
  */
 {
        unsigned long boot_error;
+       unsigned int i;
        int timeout;
        unsigned long start_eip;
        unsigned short nmi_high = 0, nmi_low = 0;
        struct vcpu *v;
+       struct desc_struct *gdt;
+#ifdef __x86_64__
+        struct page_info *page;
+#endif
 
        /*
         * Save current MTRR state in case it was changed since early boot
@@ -864,6 +869,38 @@ static int __devinit do_boot_cpu(int api
        /* Debug build: detect stack overflow by setting up a guard page. */
        memguard_guard_stack(stack_start.esp);
 
+       gdt = per_cpu(gdt_table, cpu);
+       if (gdt == boot_cpu_gdt_table) {
+               i = get_order_from_pages(NR_RESERVED_GDT_PAGES);
+#ifdef __x86_64__
+#ifdef CONFIG_COMPAT
+               page = alloc_domheap_pages(NULL, i,
+                                          MEMF_node(cpu_to_node(cpu)));
+               per_cpu(compat_gdt_table, cpu) = gdt = page_to_virt(page);
+               memcpy(gdt, boot_cpu_compat_gdt_table,
+                      NR_RESERVED_GDT_PAGES * PAGE_SIZE);
+               gdt[PER_CPU_GDT_ENTRY - FIRST_RESERVED_GDT_ENTRY].a = cpu;
+#endif
+               page = alloc_domheap_pages(NULL, i,
+                                          MEMF_node(cpu_to_node(cpu)));
+               per_cpu(gdt_table, cpu) = gdt = page_to_virt(page);
+#else
+               per_cpu(gdt_table, cpu) = gdt = alloc_xenheap_pages(i);
+#endif
+               memcpy(gdt, boot_cpu_gdt_table,
+                      NR_RESERVED_GDT_PAGES * PAGE_SIZE);
+               BUILD_BUG_ON(NR_CPUS > 0x10000);
+               gdt[PER_CPU_GDT_ENTRY - FIRST_RESERVED_GDT_ENTRY].a = cpu;
+printk("CPU#%d: GDT@%p[%p]\n", cpu, gdt, per_cpu(compat_gdt_table, cpu));//temp
+       }
+
+       for (i = 0; i < NR_RESERVED_GDT_PAGES; ++i)
+               v->domain->arch.mm_perdomain_pt
+                       [(v->vcpu_id << GDT_LDT_VCPU_SHIFT) +
+                        FIRST_RESERVED_GDT_PAGE + i]
+                       = l1e_from_page(virt_to_page(gdt) + i,
+                                       __PAGE_HYPERVISOR);
+
        /*
         * This grunge runs the startup process for
         * the targeted processor.
Index: 2008-09-01/xen/arch/x86/traps.c
===================================================================
--- 2008-09-01.orig/xen/arch/x86/traps.c        2008-09-10 13:43:04.000000000 +0200
+++ 2008-09-01/xen/arch/x86/traps.c     2008-09-09 16:15:21.000000000 +0200
@@ -2692,6 +2692,13 @@ asmlinkage void do_general_protection(st
         return;
     }
 
+if(regs->error_code) {//temp
+ struct desc_ptr gdt_desc;
+ asm("sgdt %0" : "=m" (gdt_desc));
+ printk("CPU[%u] GDT@%lx [%lx,%x]\n", smp_processor_id(), GDT_VIRT_START(v), 
gdt_desc.base, gdt_desc.limit);
+ show_page_walk(GDT_VIRT_START(v) + regs->error_code);
+}
+
 #if defined(__i386__)
     if ( VM_ASSIST(v->domain, VMASST_TYPE_4gb_segments) && 
          (regs->error_code == 0) && 
@@ -2961,13 +2968,13 @@ void set_intr_gate(unsigned int n, void 
 void set_tss_desc(unsigned int n, void *addr)
 {
     _set_tssldt_desc(
-        gdt_table + __TSS(n) - FIRST_RESERVED_GDT_ENTRY,
+        per_cpu(gdt_table, n) + TSS_ENTRY - FIRST_RESERVED_GDT_ENTRY,
         (unsigned long)addr,
         offsetof(struct tss_struct, __cacheline_filler) - 1,
         9);
 #ifdef CONFIG_COMPAT
     _set_tssldt_desc(
-        compat_gdt_table + __TSS(n) - FIRST_RESERVED_GDT_ENTRY,
+        per_cpu(compat_gdt_table, n) + TSS_ENTRY - FIRST_RESERVED_GDT_ENTRY,
         (unsigned long)addr,
         offsetof(struct tss_struct, __cacheline_filler) - 1,
         11);
Index: 2008-09-01/xen/arch/x86/x86_32/mm.c
===================================================================
--- 2008-09-01.orig/xen/arch/x86/x86_32/mm.c    2008-09-10 13:43:04.000000000 +0200
+++ 2008-09-01/xen/arch/x86/x86_32/mm.c 2008-09-10 16:04:08.000000000 +0200
@@ -135,6 +135,30 @@ void __init setup_idle_pagetable(void)
                                 __PAGE_HYPERVISOR));
 }
 
+unsigned long clone_idle_pagetable(struct vcpu *v)
+{
+    unsigned int i;
+    struct domain *d = v->domain;
+    l3_pgentry_t *l3_table = v->arch.pae_l3_cache.table[0];
+    l2_pgentry_t *l2_table = alloc_xenheap_page();
+
+    if ( !l2_table )
+        return 0;
+
+    memcpy(l3_table, idle_pg_table, L3_PAGETABLE_ENTRIES * sizeof(*l3_table));
+    l3_table[l3_table_offset(PERDOMAIN_VIRT_START)] =
+        l3e_from_page(virt_to_page(l2_table), _PAGE_PRESENT);
+
+    copy_page(l2_table, idle_pg_table_l2 +
+              l3_table_offset(PERDOMAIN_VIRT_START) * L2_PAGETABLE_ENTRIES);
+    for ( i = 0; i < PDPT_L2_ENTRIES; ++i )
+        l2_table[l2_table_offset(PERDOMAIN_VIRT_START) + i] =
+            l2e_from_page(virt_to_page(d->arch.mm_perdomain_pt) + i,
+                          __PAGE_HYPERVISOR);
+
+    return __pa(l3_table);
+}
+
 void __init zap_low_mappings(l2_pgentry_t *dom0_l2)
 {
     int i;
@@ -189,7 +213,7 @@ void __init subarch_init_memory(void)
     {
         /* Guest kernel runs in ring 0, not ring 1. */
         struct desc_struct *d;
-        d = &gdt_table[(FLAT_RING1_CS >> 3) - FIRST_RESERVED_GDT_ENTRY];
+        d = &boot_cpu_gdt_table[(FLAT_RING1_CS >> 3) - 
FIRST_RESERVED_GDT_ENTRY];
         d[0].b &= ~_SEGMENT_DPL;
         d[1].b &= ~_SEGMENT_DPL;
     }
Index: 2008-09-01/xen/arch/x86/x86_32/supervisor_mode_kernel.S
===================================================================
--- 2008-09-01.orig/xen/arch/x86/x86_32/supervisor_mode_kernel.S        2008-09-10 13:43:04.000000000 +0200
+++ 2008-09-01/xen/arch/x86/x86_32/supervisor_mode_kernel.S     2008-09-09 13:57:13.000000000 +0200
@@ -100,15 +100,10 @@ ENTRY(fixup_ring0_guest_stack)
         # %gs:%esi now points to the guest stack before the
         # interrupt/exception occured.
 
-        /*
-         * Reverse the __TSS macro, giving us the CPU number.
-         * The TSS for this cpu is at init_tss + ( cpu * 128 ).
-         */
-        str   %ecx
-        shrl  $3,%ecx                                   # Calculate GDT index for TSS.
-        subl  $(FIRST_RESERVED_GDT_ENTRY+8),%ecx        # %ecx = 2*cpu.
-        shll  $6,%ecx                                   # Each TSS entry is 0x80 bytes
-        addl  $init_tss,%ecx                            # but we have 2*cpu from above.
+        movl  $PER_CPU_GDT_ENTRY*8,%ecx
+        lsll  %ecx,%ecx
+        shll  $7,%ecx                                   # Each TSS entry is 0x80 bytes
+        addl  $init_tss,%ecx
 
         # Load Xen stack from TSS.
         movw  TSS_ss0(%ecx),%ax
Index: 2008-09-01/xen/arch/x86/x86_32/traps.c
===================================================================
--- 2008-09-01.orig/xen/arch/x86/x86_32/traps.c 2008-09-10 13:43:04.000000000 +0200
+++ 2008-09-01/xen/arch/x86/x86_32/traps.c      2008-09-09 14:48:33.000000000 +0200
@@ -197,13 +197,15 @@ static unsigned char doublefault_stack[D
 
 asmlinkage void do_double_fault(void)
 {
-    struct tss_struct *tss = &doublefault_tss;
-    unsigned int cpu = ((tss->back_link>>3)-__FIRST_TSS_ENTRY)>>1;
+    struct tss_struct *tss;
+    unsigned int cpu;
 
     watchdog_disable();
 
     console_force_unlock();
 
+    asm ( "lsll %1, %0" : "=r" (cpu) : "rm" (PER_CPU_GDT_ENTRY << 3) );
+
     /* Find information saved during fault and dump it to the console. */
     tss = &init_tss[cpu];
     printk("*** DOUBLE FAULT ***\n");
@@ -328,7 +330,7 @@ void __devinit subarch_percpu_traps_init
     tss->eflags = 2;
     tss->bitmap = IOBMP_INVALID_OFFSET;
     _set_tssldt_desc(
-        gdt_table + __DOUBLEFAULT_TSS_ENTRY - FIRST_RESERVED_GDT_ENTRY,
        boot_cpu_gdt_table + __DOUBLEFAULT_TSS_ENTRY - FIRST_RESERVED_GDT_ENTRY,
         (unsigned long)tss, 235, 9);
 
     set_task_gate(TRAP_double_fault, __DOUBLEFAULT_TSS_ENTRY<<3);
Index: 2008-09-01/xen/arch/x86/x86_64/mm.c
===================================================================
--- 2008-09-01.orig/xen/arch/x86/x86_64/mm.c    2008-09-10 13:43:04.000000000 +0200
+++ 2008-09-01/xen/arch/x86/x86_64/mm.c 2008-09-10 15:51:37.000000000 +0200
@@ -21,6 +21,7 @@
 #include <xen/lib.h>
 #include <xen/init.h>
 #include <xen/mm.h>
+#include <xen/numa.h>
 #include <xen/sched.h>
 #include <xen/guest_access.h>
 #include <asm/current.h>
@@ -209,6 +210,24 @@ void __init setup_idle_pagetable(void)
                   __PAGE_HYPERVISOR));
 }
 
+unsigned long clone_idle_pagetable(struct vcpu *v)
+{
+    struct domain *d = v->domain;
+    struct page_info *page = alloc_domheap_page(NULL,
+                                                MEMF_node(vcpu_to_node(v)));
+    l4_pgentry_t *l4_table = page_to_virt(page);
+
+    if ( !page )
+        return 0;
+
+    copy_page(l4_table, idle_pg_table);
+    l4_table[l4_table_offset(PERDOMAIN_VIRT_START)] =
+        l4e_from_page(virt_to_page(d->arch.mm_perdomain_l3),
+                      __PAGE_HYPERVISOR);
+
+    return __pa(l4_table);
+}
+
 void __init zap_low_mappings(void)
 {
     BUG_ON(num_online_cpus() != 1);
Index: 2008-09-01/xen/arch/x86/x86_64/traps.c
===================================================================
--- 2008-09-01.orig/xen/arch/x86/x86_64/traps.c 2008-09-10 13:43:04.000000000 +0200
+++ 2008-09-01/xen/arch/x86/x86_64/traps.c      2008-09-09 14:49:19.000000000 +0200
@@ -217,15 +217,14 @@ void show_page_walk(unsigned long addr)
 asmlinkage void double_fault(void);
 asmlinkage void do_double_fault(struct cpu_user_regs *regs)
 {
-    unsigned int cpu, tr;
-
-    asm volatile ( "str %0" : "=r" (tr) );
-    cpu = ((tr >> 3) - __FIRST_TSS_ENTRY) >> 2;
+    unsigned int cpu;
 
     watchdog_disable();
 
     console_force_unlock();
 
+    asm ( "lsll %1, %0" : "=r" (cpu) : "rm" (PER_CPU_GDT_ENTRY << 3) );
+
     /* Find information saved during fault and dump it to the console. */
     printk("*** DOUBLE FAULT ***\n");
     print_xen_info();
Index: 2008-09-01/xen/common/domain.c
===================================================================
--- 2008-09-01.orig/xen/common/domain.c 2008-09-10 13:43:04.000000000 +0200
+++ 2008-09-01/xen/common/domain.c      2008-09-10 08:29:58.000000000 +0200
@@ -172,7 +172,7 @@ struct vcpu *alloc_idle_vcpu(unsigned in
 {
     struct domain *d;
     struct vcpu *v;
-    unsigned int vcpu_id = cpu_id % MAX_VIRT_CPUS;
+    unsigned int vcpu_id = cpu_id % 2;//temp MAX_VIRT_CPUS;
 
     if ( (v = idle_vcpu[cpu_id]) != NULL )
         return v;
Index: 2008-09-01/xen/include/asm-x86/desc.h
===================================================================
--- 2008-09-01.orig/xen/include/asm-x86/desc.h  2008-09-10 13:43:04.000000000 +0200
+++ 2008-09-01/xen/include/asm-x86/desc.h       2008-09-09 14:32:27.000000000 +0200
@@ -34,11 +34,9 @@
 #define FLAT_COMPAT_USER_CS   FLAT_COMPAT_RING3_CS
 #define FLAT_COMPAT_USER_SS   FLAT_COMPAT_RING3_SS
 
-#define __FIRST_TSS_ENTRY (FIRST_RESERVED_GDT_ENTRY + 8)
-#define __FIRST_LDT_ENTRY (__FIRST_TSS_ENTRY + 2)
-
-#define __TSS(n) (((n)<<2) + __FIRST_TSS_ENTRY)
-#define __LDT(n) (((n)<<2) + __FIRST_LDT_ENTRY)
+#define TSS_ENTRY (FIRST_RESERVED_GDT_ENTRY + 8)
+#define LDT_ENTRY (TSS_ENTRY + 2)
+#define PER_CPU_GDT_ENTRY (LDT_ENTRY + 2)
 
 #elif defined(__i386__)
 
@@ -51,17 +49,15 @@
 
 #define __DOUBLEFAULT_TSS_ENTRY FIRST_RESERVED_GDT_ENTRY
 
-#define __FIRST_TSS_ENTRY (FIRST_RESERVED_GDT_ENTRY + 8)
-#define __FIRST_LDT_ENTRY (__FIRST_TSS_ENTRY + 1)
-
-#define __TSS(n) (((n)<<1) + __FIRST_TSS_ENTRY)
-#define __LDT(n) (((n)<<1) + __FIRST_LDT_ENTRY)
+#define TSS_ENTRY (FIRST_RESERVED_GDT_ENTRY + 8)
+#define LDT_ENTRY (TSS_ENTRY + 1)
+#define PER_CPU_GDT_ENTRY (LDT_ENTRY + 1)
 
 #endif
 
 #ifndef __ASSEMBLY__
 
-#define load_TR(n)  __asm__ __volatile__ ("ltr  %%ax" : : "a" (__TSS(n)<<3) )
+#define load_TR(n)  __asm__ __volatile__ ("ltr  %%ax" : : "a" (TSS_ENTRY<<3) )
 
 #if defined(__x86_64__)
 #define GUEST_KERNEL_RPL(d) (is_pv_32bit_domain(d) ? 1 : 3)
@@ -205,11 +201,19 @@ do {                                    
 
 #endif
 
-extern struct desc_struct gdt_table[];
+struct desc_ptr {
+       unsigned short limit;
+       unsigned long base;
+} __attribute__((__packed__)) ;
+
+extern struct desc_struct boot_cpu_gdt_table[];
+DECLARE_PER_CPU(struct desc_struct *, gdt_table);
 #ifdef CONFIG_COMPAT
-extern struct desc_struct compat_gdt_table[];
+extern struct desc_struct boot_cpu_compat_gdt_table[];
+DECLARE_PER_CPU(struct desc_struct *, compat_gdt_table);
 #else
-# define compat_gdt_table gdt_table
+# define boot_cpu_compat_gdt_table boot_cpu_gdt_table
+# define per_cpu__compat_gdt_table per_cpu__gdt_table
 #endif
 
 extern void set_intr_gate(unsigned int irq, void * addr);
Index: 2008-09-01/xen/include/asm-x86/ldt.h
===================================================================
--- 2008-09-01.orig/xen/include/asm-x86/ldt.h   2008-09-10 13:43:04.000000000 +0200
+++ 2008-09-01/xen/include/asm-x86/ldt.h        2008-09-09 14:13:41.000000000 +0200
@@ -6,7 +6,6 @@
 
 static inline void load_LDT(struct vcpu *v)
 {
-    unsigned int cpu;
     struct desc_struct *desc;
     unsigned long ents;
 
@@ -16,11 +15,11 @@ static inline void load_LDT(struct vcpu 
     }
     else
     {
-        cpu = smp_processor_id();
-        desc = (!is_pv_32on64_vcpu(v) ? gdt_table : compat_gdt_table)
-               + __LDT(cpu) - FIRST_RESERVED_GDT_ENTRY;
+        desc = (!is_pv_32on64_vcpu(v)
+                ? this_cpu(gdt_table) : this_cpu(compat_gdt_table))
+               + LDT_ENTRY - FIRST_RESERVED_GDT_ENTRY;
         _set_tssldt_desc(desc, LDT_VIRT_START(v), ents*8-1, 2);
-        __asm__ __volatile__ ( "lldt %%ax" : : "a" (__LDT(cpu)<<3) );
+        __asm__ __volatile__ ( "lldt %%ax" : : "a" (LDT_ENTRY << 3) );
     }
 }
 
Index: 2008-09-01/xen/include/asm-x86/page.h
===================================================================
--- 2008-09-01.orig/xen/include/asm-x86/page.h  2008-09-10 13:43:04.000000000 +0200
+++ 2008-09-01/xen/include/asm-x86/page.h       2008-09-10 09:06:02.000000000 +0200
@@ -278,6 +278,7 @@ extern unsigned int   m2p_compat_vstart;
 #endif
 void paging_init(void);
 void setup_idle_pagetable(void);
+unsigned long clone_idle_pagetable(struct vcpu *);
 #endif /* !defined(__ASSEMBLY__) */
 
 #define _PAGE_PRESENT  0x001U


_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel