ChangeSet 1.1616, 2005/05/31 16:39:28+01:00, kaf24@xxxxxxxxxxxxxxxxxxxx
Subject: PAE support
This patch adds initial support for PAE paging to Xen.
This patch:
* boots Xen itself with PAE paging enabled.
* adds PAE support to the dom0 domain builder.
Some notes on the design and the changes:
* There are two new config options: CONFIG_X86_PAE (boolean,
the same name Linux uses, to simplify things) and
CONFIG_PAGING_LEVELS (int; possible values are 2, 3 and 4).
I've used #if CONFIG_PAGING_LEVELS for code which simply
depends on the number of paging levels and is common to
x86-32/64, and CONFIG_X86_PAE for special PAE quirks or
i386-only code. I've tried to avoid ifdefs where possible,
though; often I instead rearranged code to make it work in
both the PAE and non-PAE cases. (The first sketch after
these notes shows the intended usage.)
* idle_pg_table: the 3rd level is statically initialized; the
2nd level is contiguous in physical and virtual memory, so it
can be addressed linearly (the dom0 builder uses the same
trick to simplify things a bit, btw). There are two new
symbols, idle_pg_table_l3 and idle_pg_table_l2, for the two
tables. idle_pg_table is aliased to the top-level page table,
i.e. idle_pg_table_l3 in PAE mode and idle_pg_table_l2 in
non-PAE mode. The idle l3 table is never touched after boot;
the l2 table is accessed via idle_pg_table_l2 and addressed
linearly in both PAE and non-PAE mode. (The second sketch
after these notes illustrates the linear addressing.)
* I've added an "intpte_t" type and a PRIpte define, modeled
after the C99 inttypes.h header, for page table entries. (The
third sketch after these notes shows the idea.)
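
As a first sketch of how the two options are meant to be used
(illustrative only; the macros below are made up, not from the
patch):

    /* Code that depends only on the number of paging levels is
     * shared between PAE (3 levels) and x86-64 (4 levels). */
    #if CONFIG_PAGING_LEVELS >= 3
    # define HAVE_L3_TABLE 1
    #else
    # define HAVE_L3_TABLE 0
    #endif

    /* i386-only PAE quirks stay under CONFIG_X86_PAE, e.g. page
     * table entries becoming 64-bit on a 32-bit build. */
    #ifdef CONFIG_X86_PAE
    # define PTE_BITS 64
    #else
    # define PTE_BITS 32
    #endif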
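
Second, a minimal sketch of the linear l2 addressing, assuming
the patch's idle_pg_table_l2 symbol and the PAE value of
L2_PAGETABLE_SHIFT (21, i.e. 512 entries x 2MB per l2 page, so
indices 0..2047 cover the full 4GB). The helper name is made up;
l2_linear_offset() in the patch does the equivalent index
calculation:

    extern l2_pgentry_t idle_pg_table_l2[]; /* 4 contiguous pages */

    /* One flat index into the contiguous l2 pages replaces the
     * usual l3-then-l2 walk. */
    static l2_pgentry_t *idle_l2_entry(unsigned long va)
    {
        return &idle_pg_table_l2[va >> L2_PAGETABLE_SHIFT];
    }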
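
Third, the intpte_t/PRIpte idea, analogous to uint64_t/PRIx64
from C99 inttypes.h (the widths and format strings below are
illustrative; the real definitions live in the new page-2level.h
and page-3level.h headers):

    #ifdef CONFIG_X86_PAE
    typedef unsigned long long intpte_t; /* 64-bit PTEs */
    #define PRIpte "016llx"
    #else
    typedef unsigned long intpte_t;      /* 32-bit PTEs */
    #define PRIpte "08lx"
    #endif

    /* The same format string works in both builds: */
    static void dump_pte(intpte_t pte)
    {
        printk("pte = %" PRIpte "\n", pte);
    }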
Signed-off-by: Gerd Knorr <kraxel@xxxxxxxxxxx>
arch/x86/audit.c | 4
arch/x86/boot/x86_32.S | 43 ++++
arch/x86/dom0_ops.c | 2
arch/x86/domain.c | 8
arch/x86/domain_build.c | 92 +++++++--
arch/x86/idle0_task.c | 3
arch/x86/mm.c | 348 ++++++++++++++++++++++++-----------
arch/x86/setup.c | 2
arch/x86/shadow.c | 51 +++--
arch/x86/traps.c | 2
arch/x86/vmx.c | 12 -
arch/x86/vmx_io.c | 6
arch/x86/vmx_vmcs.c | 4
arch/x86/x86_32/domain_page.c | 2
arch/x86/x86_32/mm.c | 140 +++++++++-----
arch/x86/x86_32/traps.c | 17 +
arch/x86/x86_64/mm.c | 2
include/asm-x86/config.h | 33 ++-
include/asm-x86/domain.h | 6
include/asm-x86/mm.h | 14 -
include/asm-x86/page.h | 151 +++++++++++++--
include/asm-x86/shadow.h | 6
include/asm-x86/smp.h | 7
include/asm-x86/types.h | 6
include/asm-x86/x86_32/page-2level.h | 49 ++++
include/asm-x86/x86_32/page-3level.h | 56 +++++
include/asm-x86/x86_32/page.h | 127 ------------
include/asm-x86/x86_64/page.h | 188 ++----------------
include/public/arch-x86_32.h | 6
29 files changed, 830 insertions(+), 557 deletions(-)
diff -Nru a/xen/arch/x86/audit.c b/xen/arch/x86/audit.c
--- a/xen/arch/x86/audit.c 2005-05-31 12:04:00 -04:00
+++ b/xen/arch/x86/audit.c 2005-05-31 12:04:00 -04:00
@@ -408,9 +408,9 @@
for_each_exec_domain(d, ed)
{
- if ( pagetable_val(ed->arch.guest_table) )
+ if ( pagetable_get_phys(ed->arch.guest_table) )
adjust(&frame_table[pagetable_get_pfn(ed->arch.guest_table)],
1);
- if ( pagetable_val(ed->arch.shadow_table) )
+ if ( pagetable_get_phys(ed->arch.shadow_table) )
adjust(&frame_table[pagetable_get_pfn(ed->arch.shadow_table)],
0);
if ( ed->arch.monitor_shadow_ref )
adjust(&frame_table[ed->arch.monitor_shadow_ref], 0);
diff -Nru a/xen/arch/x86/boot/x86_32.S b/xen/arch/x86/boot/x86_32.S
--- a/xen/arch/x86/boot/x86_32.S 2005-05-31 12:04:00 -04:00
+++ b/xen/arch/x86/boot/x86_32.S 2005-05-31 12:04:00 -04:00
@@ -101,6 +101,22 @@
xor %eax,%eax
rep stosb
+#ifdef CONFIG_X86_PAE
+ /* Initialize low and high mappings of all memory with 2MB pages */
+ mov $idle_pg_table_l2-__PAGE_OFFSET,%edi
+ mov $0xe3,%eax /* PRESENT+RW+A+D+2MB */
+1: mov %eax,__PAGE_OFFSET>>18(%edi) /* high mapping */
+ stosl /* low mapping */
+ add $4,%edi
+ add $(1<<L2_PAGETABLE_SHIFT),%eax
+ cmp $DIRECTMAP_PHYS_END+0xe3,%eax
+ jne 1b
+1: stosl /* low mappings cover as much physmem as possible */
+ add $4,%edi
+ add $(1<<L2_PAGETABLE_SHIFT),%eax
+ cmp $__HYPERVISOR_VIRT_START+0xe3,%eax
+ jne 1b
+#else
/* Initialize low and high mappings of all memory with 4MB pages */
mov $idle_pg_table-__PAGE_OFFSET,%edi
mov $0xe3,%eax /* PRESENT+RW+A+D+4MB */
@@ -113,6 +129,7 @@
add $(1<<L2_PAGETABLE_SHIFT),%eax
cmp $__HYPERVISOR_VIRT_START+0xe3,%eax
jne 1b
+#endif
/* Initialise IDT with simple error defaults. */
lea ignore_int,%edx
@@ -204,10 +221,17 @@
.quad 0x0000000000000000 /* unused */
.quad 0x00cf9a000000ffff /* 0xe008 ring 0 4.00GB code at 0x0 */
.quad 0x00cf92000000ffff /* 0xe010 ring 0 4.00GB data at 0x0 */
+#ifdef CONFIG_X86_PAE
+ .quad 0x00cfba00000067ff
+ .quad 0x00cfb200000067ff
+ .quad 0x00cffa00000067ff
+ .quad 0x00cff200000067ff
+#else
.quad 0x00cfba000000c3ff /* 0xe019 ring 1 3.95GB code at 0x0 */
.quad 0x00cfb2000000c3ff /* 0xe021 ring 1 3.95GB data at 0x0 */
.quad 0x00cffa000000c3ff /* 0xe02b ring 3 3.95GB code at 0x0 */
.quad 0x00cff2000000c3ff /* 0xe033 ring 3 3.95GB data at 0x0 */
+#endif
.quad 0x0000000000000000 /* unused */
.fill 2*NR_CPUS,8,0 /* space for TSS and LDT per CPU */
@@ -215,10 +239,27 @@
/* Maximum STACK_ORDER for x86/32 is 1. We must therefore ensure that the */
/* CPU0 stack is aligned on an even page boundary! */
ENTRY(cpu0_stack)
-
.org 0x2000 + STACK_SIZE
+
+#ifdef CONFIG_X86_PAE
+
ENTRY(idle_pg_table)
+ENTRY(idle_pg_table_l3)
+ .quad 0x100000 + 0x2000 + STACK_SIZE + 1*PAGE_SIZE + 0x01
+ .quad 0x100000 + 0x2000 + STACK_SIZE + 2*PAGE_SIZE + 0x01
+ .quad 0x100000 + 0x2000 + STACK_SIZE + 3*PAGE_SIZE + 0x01
+ .quad 0x100000 + 0x2000 + STACK_SIZE + 4*PAGE_SIZE + 0x01
+ .org 0x2000 + STACK_SIZE + 1*PAGE_SIZE
+ENTRY(idle_pg_table_l2)
+ .org 0x2000 + STACK_SIZE + 5*PAGE_SIZE
+
+#else /* CONFIG_X86_PAE */
+ENTRY(idle_pg_table)
+ENTRY(idle_pg_table_l2) # Initial page directory is 4kB
.org 0x2000 + STACK_SIZE + PAGE_SIZE
+
+#endif /* CONFIG_X86_PAE */
+
ENTRY(stext)
ENTRY(_stext)
diff -Nru a/xen/arch/x86/dom0_ops.c b/xen/arch/x86/dom0_ops.c
--- a/xen/arch/x86/dom0_ops.c 2005-05-31 12:04:00 -04:00
+++ b/xen/arch/x86/dom0_ops.c 2005-05-31 12:04:00 -04:00
@@ -405,7 +405,7 @@
c->flags |= VGCF_VMX_GUEST;
#endif
- c->pt_base = pagetable_val(ed->arch.guest_table);
+ c->pt_base = pagetable_get_phys(ed->arch.guest_table);
c->vm_assist = ed->domain->vm_assist;
}
diff -Nru a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
--- a/xen/arch/x86/domain.c 2005-05-31 12:04:01 -04:00
+++ b/xen/arch/x86/domain.c 2005-05-31 12:04:01 -04:00
@@ -460,7 +460,7 @@
// trust the VMX domain builder. Xen should validate this
// page table, and/or build the table itself, or ???
//
- if ( !pagetable_val(d->arch.phys_table) )
+ if ( !pagetable_get_phys(d->arch.phys_table) )
d->arch.phys_table = ed->arch.guest_table;
if ( (error = vmx_final_setup_guest(ed, c)) )
@@ -660,7 +660,7 @@
struct exec_domain *ed = current;
if ( unlikely(copy_from_user(&stu, (void *)regs->rsp, sizeof(stu))) ||
- unlikely(pagetable_val(ed->arch.guest_table_user) == 0) )
+ unlikely(pagetable_get_phys(ed->arch.guest_table_user) == 0) )
return -EFAULT;
toggle_guest_mode(ed);
@@ -978,7 +978,7 @@
/* Drop the in-use references to page-table bases. */
for_each_exec_domain ( d, ed )
{
- if ( pagetable_val(ed->arch.guest_table) != 0 )
+ if ( pagetable_get_phys(ed->arch.guest_table) != 0 )
{
if ( shadow_mode_refcounts(d) )
put_page(&frame_table[pagetable_get_pfn(ed->arch.guest_table)]);
@@ -988,7 +988,7 @@
ed->arch.guest_table = mk_pagetable(0);
}
- if ( pagetable_val(ed->arch.guest_table_user) != 0 )
+ if ( pagetable_get_phys(ed->arch.guest_table_user) != 0 )
{
if ( shadow_mode_refcounts(d) )
put_page(&frame_table[pagetable_get_pfn(ed->arch.guest_table_user)]);
diff -Nru a/xen/arch/x86/domain_build.c b/xen/arch/x86/domain_build.c
--- a/xen/arch/x86/domain_build.c 2005-05-31 12:04:00 -04:00
+++ b/xen/arch/x86/domain_build.c 2005-05-31 12:04:00 -04:00
@@ -44,15 +44,15 @@
#if defined(__i386__)
/* No ring-3 access in initial leaf page tables. */
#define L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED)
+#define L2_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
+#define L3_PROT (_PAGE_PRESENT)
#elif defined(__x86_64__)
/* Allow ring-3 access in long mode as guest cannot use ring 1. */
#define L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_USER)
-#endif
-/* Don't change these: Linux expects just these bits to be set. */
-/* (And that includes the bogus _PAGE_DIRTY!) */
#define L2_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
#define L3_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
#define L4_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER)
+#endif
#define round_pgup(_p) (((_p)+(PAGE_SIZE-1))&PAGE_MASK)
#define round_pgdown(_p) ((_p)&PAGE_MASK)
@@ -91,7 +91,11 @@
#elif defined(__x86_64__)
char *image_start = __va(_image_start);
char *initrd_start = __va(_initrd_start);
+#endif
+#if CONFIG_PAGING_LEVELS >= 4
l4_pgentry_t *l4tab = NULL, *l4start = NULL;
+#endif
+#if CONFIG_PAGING_LEVELS >= 3
l3_pgentry_t *l3tab = NULL, *l3start = NULL;
#endif
l2_pgentry_t *l2tab = NULL, *l2start = NULL;
@@ -143,7 +147,7 @@
panic("Not enough RAM for DOM0 reservation.\n");
alloc_start = page_to_phys(page);
alloc_end = alloc_start + (d->tot_pages << PAGE_SHIFT);
-
+
if ( (rc = parseelfimage(&dsi)) != 0 )
return rc;
@@ -172,10 +176,15 @@
v_end = (vstack_end + (1UL<<22)-1) & ~((1UL<<22)-1);
if ( (v_end - vstack_end) < (512UL << 10) )
v_end += 1UL << 22; /* Add extra 4MB to get >= 512kB padding. */
-#if defined(__i386__)
+#if defined(__i386__) && !defined(CONFIG_X86_PAE)
if ( (((v_end - dsi.v_start + ((1UL<<L2_PAGETABLE_SHIFT)-1)) >>
L2_PAGETABLE_SHIFT) + 1) <= nr_pt_pages )
break;
+#elif defined(__i386__) && defined(CONFIG_X86_PAE)
+ /* 5 pages: 1x 3rd + 4x 2nd level */
+ if ( (((v_end - dsi.v_start + ((1UL<<L2_PAGETABLE_SHIFT)-1)) >>
+ L2_PAGETABLE_SHIFT) + 5) <= nr_pt_pages )
+ break;
#elif defined(__x86_64__)
#define NR(_l,_h,_s) \
(((((_h) + ((1UL<<(_s))-1)) & ~((1UL<<(_s))-1)) - \
@@ -249,6 +258,24 @@
}
/* WARNING: The new domain must have its 'processor' field filled in! */
+#if CONFIG_PAGING_LEVELS == 3
+ l3start = l3tab = (l3_pgentry_t *)mpt_alloc; mpt_alloc += PAGE_SIZE;
+ l2start = l2tab = (l2_pgentry_t *)mpt_alloc; mpt_alloc += 4*PAGE_SIZE;
+ memcpy(l2tab, idle_pg_table_l2, 4*PAGE_SIZE);
+ for (i = 0; i < 4; i++) {
+ l3tab[i] = l3e_create_phys((u32)l2tab + i*PAGE_SIZE, L3_PROT);
+ l2tab[(LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT)+i] =
+ l2e_create_phys((u32)l2tab + i*PAGE_SIZE, __PAGE_HYPERVISOR);
+ }
+ unsigned long v;
+ for (v = PERDOMAIN_VIRT_START; v < PERDOMAIN_VIRT_END;
+ v += (1 << L2_PAGETABLE_SHIFT)) {
+ l2tab[v >> L2_PAGETABLE_SHIFT] =
+ l2e_create_phys(__pa(d->arch.mm_perdomain_pt) + (v-PERDOMAIN_VIRT_START),
+ __PAGE_HYPERVISOR);
+ }
+ ed->arch.guest_table = mk_pagetable((unsigned long)l3start);
+#else
l2start = l2tab = (l2_pgentry_t *)mpt_alloc; mpt_alloc += PAGE_SIZE;
memcpy(l2tab, &idle_pg_table[0], PAGE_SIZE);
l2tab[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] =
@@ -256,8 +283,9 @@
l2tab[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] =
l2e_create_phys(__pa(d->arch.mm_perdomain_pt), __PAGE_HYPERVISOR);
ed->arch.guest_table = mk_pagetable((unsigned long)l2start);
+#endif
- l2tab += l2_table_offset(dsi.v_start);
+ l2tab += l2_linear_offset(dsi.v_start);
mfn = alloc_start >> PAGE_SHIFT;
for ( count = 0; count < ((v_end-dsi.v_start)>>PAGE_SHIFT); count++ )
{
@@ -282,8 +310,8 @@
}
/* Pages that are part of page tables must be read only. */
- l2tab = l2start + l2_table_offset(vpt_start);
- l1start = l1tab = (l1_pgentry_t *)l2e_get_phys(*l2tab);
+ l2tab = l2start + l2_linear_offset(vpt_start);
_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog